diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml index 407b41802eea7..2be059422043a 100644 --- a/.azuredevops/rocm-ci.yml +++ b/.azuredevops/rocm-ci.yml @@ -23,5 +23,7 @@ trigger: - '**/*.md' - LICENSE.TXT +pr: none + jobs: - template: ${{ variables.CI_COMPONENT_PATH }}/llvm-project.yml@pipelines_repo diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py index 5026c292a7934..36c95852452ac 100644 --- a/.ci/generate_test_report_lib.py +++ b/.ci/generate_test_report_lib.py @@ -98,6 +98,23 @@ def _format_ninja_failures(ninja_failures: list[tuple[str, str]]) -> list[str]: ) return output +def get_failures(junit_objects) -> dict[str, list[tuple[str, str]]]: + failures = {} + for results in junit_objects: + for testsuite in results: + for test in testsuite: + if ( + not test.is_passed + and test.result + and isinstance(test.result[0], Failure) + ): + if failures.get(testsuite.name) is None: + failures[testsuite.name] = [] + failures[testsuite.name].append( + (test.classname + "/" + test.name, test.result[0].text) + ) + return failures + # Set size_limit to limit the byte size of the report. The default is 1MB as this # is the most that can be put into an annotation. If the generated report exceeds @@ -113,7 +130,7 @@ def generate_report( size_limit=1024 * 1024, list_failures=True, ): - failures = {} + failures = get_failures(junit_objects) tests_run = 0 tests_skipped = 0 tests_failed = 0 @@ -124,18 +141,6 @@ def generate_report( tests_skipped += testsuite.skipped tests_failed += testsuite.failures - for test in testsuite: - if ( - not test.is_passed - and test.result - and isinstance(test.result[0], Failure) - ): - if failures.get(testsuite.name) is None: - failures[testsuite.name] = [] - failures[testsuite.name].append( - (test.classname + "/" + test.name, test.result[0].text) - ) - report = [f"# {title}", ""] if tests_run == 0: @@ -258,7 +263,7 @@ def plural(num_tests): return report -def generate_report_from_files(title, return_code, build_log_files): +def load_info_from_files(build_log_files): junit_files = [ junit_file for junit_file in build_log_files if junit_file.endswith(".xml") ] @@ -271,6 +276,9 @@ def generate_report_from_files(title, return_code, build_log_files): ninja_logs.append( [log_line.strip() for log_line in ninja_log_file_handle.readlines()] ) - return generate_report( - title, return_code, [JUnitXml.fromfile(p) for p in junit_files], ninja_logs - ) + return [JUnitXml.fromfile(p) for p in junit_files], ninja_logs + + +def generate_report_from_files(title, return_code, build_log_files): + junit_objects, ninja_logs = load_info_from_files(build_log_files) + return generate_report(title, return_code, junit_objects, ninja_logs) diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index c8f331204bd49..4a8418d7baa8c 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -66,11 +66,13 @@ start-group "ninja" # Targets are not escaped as they are passed as separate arguments. 
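As a quick illustration of how the two refactored helpers above compose (a minimal sketch, not part of the patch: the file paths are made up, and it assumes `junitparser` is installed and the snippet runs from `.ci/` so that `generate_test_report_lib` is importable):

```python
# Parse build logs once, then reuse the parsed objects for both report
# generation and failure extraction, as the new premerge advisor scripts do.
import generate_test_report_lib

# load_info_from_files() splits its inputs by extension: *.xml files are
# parsed as JUnit reports, everything else is read as a ninja log.
junit_objects, ninja_logs = generate_test_report_lib.load_info_from_files(
    ["test-results.lit.xml", "ninja.log"]  # hypothetical paths
)

# get_failures() maps a suite name to a list of
# ("classname/test name", failure text) tuples.
for suite, failures in generate_test_report_lib.get_failures(junit_objects).items():
    for name, message in failures:
        print(f"[{suite}] {name}: {(message or '')[:80]}")
```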
ninja -C "${BUILD_DIR}" -k 0 ${targets} |& tee ninja.log +cp ${BUILD_DIR}/.ninja_log ninja.ninja_log if [[ "${runtime_targets}" != "" ]]; then start-group "ninja Runtimes" ninja -C "${BUILD_DIR}" ${runtime_targets} |& tee ninja_runtimes.log + cp ${BUILD_DIR}/.ninja_log ninja_runtimes.ninja_log fi # Compiling runtimes with just-built Clang and running their tests @@ -87,6 +89,7 @@ if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig} \ |& tee ninja_runtimes_needs_reconfig1.log + cp ${BUILD_DIR}/.ninja_log ninja_runtimes_needs_reconig.ninja_log start-group "CMake Runtimes Clang Modules" @@ -99,4 +102,5 @@ if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig} \ |& tee ninja_runtimes_needs_reconfig2.log + cp ${BUILD_DIR}/.ninja_log ninja_runtimes_needs_reconfig2.ninja_log fi diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index f85d6e3d51b57..219979dd3e36e 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -55,9 +55,11 @@ start-group "ninja" # Targets are not escaped as they are passed as separate arguments. ninja -C "${BUILD_DIR}" -k 0 ${targets} |& tee ninja.log +cp ${BUILD_DIR}/.ninja_log ninja.ninja_log if [[ "${runtime_targets}" != "" ]]; then start-group "ninja runtimes" ninja -C "${BUILD_DIR}" -k 0 ${runtimes_targets} |& tee ninja_runtimes.log + cp ${BUILD_DIR}/.ninja_log ninja_runtimes.ninja_log fi diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py new file mode 100644 index 0000000000000..06c6cb9aaa46b --- /dev/null +++ b/.ci/premerge_advisor_explain.py @@ -0,0 +1,63 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Script for getting explanations from the premerge advisor.""" + +import argparse +import os +import platform +import sys + +import requests + +import generate_test_report_lib + +PREMERGE_ADVISOR_URL = ( + "http://premerge-advisor.premerge-advisor.svc.cluster.local:5000/explain" +) + + +def main(commit_sha: str, build_log_files: list[str]): + junit_objects, ninja_logs = generate_test_report_lib.load_info_from_files( + build_log_files + ) + test_failures = generate_test_report_lib.get_failures(junit_objects) + current_platform = f"{platform.system()}-{platform.machine()}".lower() + explanation_request = { + "base_commit_sha": commit_sha, + "platform": current_platform, + "failures": [], + } + if test_failures: + for _, failures in test_failures.items(): + for name, failure_message in failures: + explanation_request["failures"].append( + {"name": name, "message": failure_message} + ) + else: + ninja_failures = generate_test_report_lib.find_failure_in_ninja_logs(ninja_logs) + for name, failure_message in ninja_failures: + explanation_request["failures"].append( + {"name": name, "message": failure_message} + ) + advisor_response = requests.get(PREMERGE_ADVISOR_URL, json=explanation_request) + if advisor_response.status_code == 200: + print(advisor_response.json()) + else: + print(advisor_response.reason) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("commit_sha", help="The base commit SHA for the test.") + parser.add_argument( + "build_log_files", help="Paths to JUnit report files and ninja logs.", nargs="*" + ) + args = parser.parse_args() + + # Skip looking for results on AArch64 for now because the premerge advisor + # service is not available on AWS currently. + if platform.machine() == "arm64": + sys.exit(0) + + main(args.commit_sha, args.build_log_files) diff --git a/.ci/premerge_advisor_upload.py b/.ci/premerge_advisor_upload.py new file mode 100644 index 0000000000000..1fc2423dd24fc --- /dev/null +++ b/.ci/premerge_advisor_upload.py @@ -0,0 +1,61 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information.
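For reference, a sketch of the request body the explain script above assembles before issuing its `requests.get(..., json=...)` call; every value here is illustrative, not taken from a real run. The upload variant that follows builds the same structure plus `source_type` and `source_id` fields:

```python
# Hypothetical /explain request payload; all values below are made up.
explanation_request = {
    "base_commit_sha": "0123456789abcdef0123456789abcdef01234567",
    # f"{platform.system()}-{platform.machine()}".lower() on a Linux x86_64 host
    "platform": "linux-x86_64",
    "failures": [
        {
            # "classname/test name" for JUnit failures, or a ninja target
            # for build failures found via find_failure_in_ninja_logs().
            "name": "Clang.Driver/example.c",
            "message": "error: expected string not found in input",
        }
    ],
}
```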
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Script for uploading results to the premerge advisor.""" + +import argparse +import os +import platform +import sys + +import requests + +import generate_test_report_lib + +PREMERGE_ADVISOR_URL = ( + "http://premerge-advisor.premerge-advisor.svc.cluster.local:5000/upload" +) + + +def main(commit_sha, workflow_run_number, build_log_files): + junit_objects, ninja_logs = generate_test_report_lib.load_info_from_files( + build_log_files + ) + test_failures = generate_test_report_lib.get_failures(junit_objects) + source = "pull_request" if "GITHUB_ACTIONS" in os.environ else "postcommit" + current_platform = f"{platform.system()}-{platform.machine()}".lower() + failure_info = { + "source_type": source, + "base_commit_sha": commit_sha, + "source_id": workflow_run_number, + "failures": [], + "platform": current_platform, + } + if test_failures: + for _, failures in test_failures.items(): + for name, failure_message in failures: + failure_info["failures"].append( + {"name": name, "message": failure_message} + ) + else: + ninja_failures = generate_test_report_lib.find_failure_in_ninja_logs(ninja_logs) + for name, failure_message in ninja_failures: + failure_info["failures"].append({"name": name, "message": failure_message}) + requests.post(PREMERGE_ADVISOR_URL, json=failure_info) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("commit_sha", help="The base commit SHA for the test.") + parser.add_argument("workflow_run_number", help="The run number from GHA.") + parser.add_argument( + "build_log_files", help="Paths to JUnit report files and ninja logs.", nargs="*" + ) + args = parser.parse_args() + + # Skip uploading results on AArch64 for now because the premerge advisor + # service is not available on AWS currently. + if platform.machine() == "arm64": + sys.exit(0) + + main(args.commit_sha, args.workflow_run_number, args.build_log_files) diff --git a/.ci/utils.sh b/.ci/utils.sh index 5d32968babb39..dc8ce9b9a4214 100644 --- a/.ci/utils.sh +++ b/.ci/utils.sh @@ -26,7 +26,7 @@ function at-exit { mkdir -p artifacts sccache --show-stats sccache --show-stats >> artifacts/sccache_stats.txt - cp "${BUILD_DIR}"/.ninja_log artifacts/.ninja_log + cp "${MONOREPO_ROOT}"/*.ninja_log artifacts/ || : cp "${MONOREPO_ROOT}"/*.log artifacts/ || : cp "${BUILD_DIR}"/test-results.*.xml artifacts/ || : @@ -38,6 +38,17 @@ function at-exit { $retcode "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log \ >> $GITHUB_STEP_SUMMARY fi + + if [[ "$retcode" != "0" ]]; then + python "${MONOREPO_ROOT}"/.ci/premerge_advisor_upload.py \ + $(git rev-parse HEAD~1) $GITHUB_RUN_NUMBER \ + "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log + if [[ "$GITHUB_ACTIONS" != "" ]]; then + python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \ + $(git rev-parse HEAD~1) "${BUILD_DIR}"/test-results.*.xml \ + "${MONOREPO_ROOT}"/ninja*.log + fi + fi } trap at-exit EXIT diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index a6192e1b3e81b..3876539652334 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -53,6 +53,13 @@ /mlir/include/mlir/Interfaces/DestinationStyleOpInterface.* @matthias-springer /mlir/lib/Interfaces/DestinationStyleOpInterface.* @matthias-springer +# AMDGPU and ROCDL dialects in MLIR. 
+/mlir/include/mlir/Dialect/AMDGPU @krzysz00 @kuhar +/mlir/lib/Dialect/AMDGPU @krzysz00 @kuhar +/mlir/lib/Conversion/*AMDGPU* @krzysz00 @kuhar +/mlir/lib/Conversion/*ToROCDL @krzysz00 @kuhar +/mlir/include/mlir/Dialect/LLVMIR/ROCDL* @krzysz00 @kuhar + # Bufferization Dialect in MLIR. /mlir/include/mlir/Dialect/Bufferization @matthias-springer /mlir/lib/Dialect/Bufferization @matthias-springer @@ -131,6 +138,7 @@ /mlir/test/python/ @ftynse @makslevental @stellaraccident @rolfmorel /mlir/python/ @ftynse @makslevental @stellaraccident @rolfmorel /mlir/lib/Bindings/Python @makslevental @rolfmorel +/mlir/include/Bindings/Python @makslevental @rolfmorel # MLIR Mem2Reg/SROA /mlir/**/Transforms/Mem2Reg.* @moxinilian diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000000..8b137891791fe --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1 @@ + diff --git a/.github/workflows/containers/github-action-ci-tooling/Dockerfile b/.github/workflows/containers/github-action-ci-tooling/Dockerfile index 7d64562876628..9d2aaf6bbd48a 100644 --- a/.github/workflows/containers/github-action-ci-tooling/Dockerfile +++ b/.github/workflows/containers/github-action-ci-tooling/Dockerfile @@ -1,7 +1,10 @@ ARG LLVM_VERSION=21.1.0 +# FIXME: Use "${LLVM_VERSION%%.*}" instead of "LLVM_VERSION_MAJOR" once we update runners to Ubuntu-26.04 with Buildah >= 1.37 +ARG LLVM_VERSION_MAJOR=21 FROM docker.io/library/ubuntu:24.04 AS llvm-downloader ARG LLVM_VERSION +ARG LLVM_VERSION_MAJOR RUN apt-get update && \ apt-get install -y wget xz-utils && \ @@ -9,6 +12,8 @@ RUN apt-get update && \ mkdir -p /llvm-extract && \ tar -xvJf llvm.tar.xz -C /llvm-extract \ # Only unpack these tools to save space on Github runner. + LLVM-${LLVM_VERSION}-Linux-X64/bin/clang-${LLVM_VERSION_MAJOR} \ + LLVM-${LLVM_VERSION}-Linux-X64/lib/clang/${LLVM_VERSION_MAJOR}/include \ LLVM-${LLVM_VERSION}-Linux-X64/bin/clang-tidy \ LLVM-${LLVM_VERSION}-Linux-X64/bin/clang-format \ LLVM-${LLVM_VERSION}-Linux-X64/bin/git-clang-format && \ @@ -50,12 +55,27 @@ RUN pip install -r requirements_formatting.txt --break-system-packages && \ FROM base AS ci-container-code-lint ARG LLVM_VERSION +ARG LLVM_VERSION_MAJOR -COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/clang-tidy ${LLVM_SYSROOT}/bin/ +COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/clang-tidy \ + /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/bin/clang-${LLVM_VERSION_MAJOR} \ + ${LLVM_SYSROOT}/bin/ +COPY --from=llvm-downloader /llvm-extract/LLVM-${LLVM_VERSION}-Linux-X64/lib/clang/${LLVM_VERSION_MAJOR}/include \ + ${LLVM_SYSROOT}/lib/clang/${LLVM_VERSION_MAJOR}/include COPY clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py ${LLVM_SYSROOT}/bin/clang-tidy-diff.py +RUN ln -s ${LLVM_SYSROOT}/bin/clang-${LLVM_VERSION_MAJOR} ${LLVM_SYSROOT}/bin/clang && \ + ln -s ${LLVM_SYSROOT}/bin/clang ${LLVM_SYSROOT}/bin/clang++ + ENV PATH=${LLVM_SYSROOT}/bin:${PATH} +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + cmake \ + ninja-build && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + # Install dependencies for 'pr-code-lint.yml' job COPY llvm/utils/git/requirements_linting.txt requirements_linting.txt RUN pip install -r requirements_linting.txt --break-system-packages && \ diff --git a/.github/workflows/containers/github-action-ci-windows/Dockerfile b/.github/workflows/containers/github-action-ci-windows/Dockerfile index 640d34da02532..9ddf5017bc020 100644 
--- a/.github/workflows/containers/github-action-ci-windows/Dockerfile +++ b/.github/workflows/containers/github-action-ci-windows/Dockerfile @@ -90,7 +90,7 @@ RUN powershell -Command \ RUN git config --system core.longpaths true & \ git config --global core.autocrlf false -ARG RUNNER_VERSION=2.328.0 +ARG RUNNER_VERSION=2.329.0 ENV RUNNER_VERSION=$RUNNER_VERSION RUN powershell -Command \ diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile index 1d3f5f9c35d7f..1b376dd4420b3 100644 --- a/.github/workflows/containers/github-action-ci/Dockerfile +++ b/.github/workflows/containers/github-action-ci/Dockerfile @@ -2,7 +2,7 @@ FROM docker.io/library/ubuntu:24.04 AS base ENV LLVM_SYSROOT=/opt/llvm FROM base AS stage1-toolchain -ENV LLVM_VERSION=21.1.1 +ENV LLVM_VERSION=21.1.3 RUN apt-get update && \ apt-get install -y \ @@ -99,7 +99,7 @@ WORKDIR /home/gha FROM ci-container AS ci-container-agent -ENV GITHUB_RUNNER_VERSION=2.328.0 +ENV GITHUB_RUNNER_VERSION=2.329.0 RUN mkdir actions-runner && \ cd actions-runner && \ diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index 1e0dc7045c1cc..2b85d8b59869c 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -12,6 +12,8 @@ on: jobs: code_formatter: runs-on: ubuntu-24.04 + container: + image: 'ghcr.io/llvm/ci-ubuntu-24.04-format' timeout-minutes: 30 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} @@ -23,6 +25,14 @@ jobs: with: fetch-depth: 2 + # We need to set the repo checkout as safe, otherwise tj-actions/changed-files + # will fail due to the changed ownership inside the container. + # TODO(boomanaiden154): We should probably fix this by having the default user + # in the container have the same ID as the GHA user on the host. + - name: Set Safe Directory + run: | + chown -R root $(pwd) + - name: Get changed files id: changed-files uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 @@ -39,24 +49,6 @@ jobs: echo "Formatting files:" echo "$CHANGED_FILES" - # The clang format version should always be upgraded to the first version - # of a release cycle (x.1.0) or the last version of a release cycle, or - # if there have been relevant clang-format backports. 
- - name: Install clang-format - uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1 - with: - clangformat: 21.1.0 - - - name: Setup Python env - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: '3.13' - cache: 'pip' - cache-dependency-path: 'llvm/utils/git/requirements_formatting.txt' - - - name: Install python dependencies - run: pip install -r llvm/utils/git/requirements_formatting.txt - - name: Run code formatter env: GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }} diff --git a/.github/workflows/pr-code-lint.yml b/.github/workflows/pr-code-lint.yml index 776ec4af9d2dc..e67b518149c2c 100644 --- a/.github/workflows/pr-code-lint.yml +++ b/.github/workflows/pr-code-lint.yml @@ -20,7 +20,7 @@ jobs: run: shell: bash container: - image: 'ghcr.io/llvm/ci-ubuntu-24.04:latest' + image: 'ghcr.io/llvm/ci-ubuntu-24.04-lint' timeout-minutes: 60 concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -31,6 +31,11 @@ jobs: with: fetch-depth: 2 + # FIXME: same as in ".github/workflows/pr-code-format.yml" + - name: Set Safe Directory + run: | + chown -R root $(pwd) + - name: Get changed files id: changed-files uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 @@ -46,22 +51,6 @@ jobs: run: | echo "Changed files:" echo "$CHANGED_FILES" - - # The clang tidy version should always be upgraded to the first version - # of a release cycle (x.1.0) or the last version of a release cycle, or - # if there have been relevant clang-format backports. - - name: Install clang-tidy - uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1 - with: - clang-tidy: 21.1.0 - - - name: Setup Python env - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: '3.13' - - - name: Install Python dependencies - run: python3 -m pip install -r llvm/utils/git/requirements_linting.txt # TODO: create special mapping for 'codegen' targets, for now build predefined set # TODO: add entrypoint in 'compute_projects.py' that only adds a project and its direct dependencies diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 03c0c01d382ff..951fc16bed215 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -62,6 +62,7 @@ jobs: with: fetch-depth: 2 - name: Build and Test + timeout-minutes: 120 continue-on-error: ${{ runner.arch == 'ARM64' }} run: | git config --global --add safe.directory '*' @@ -149,6 +150,7 @@ jobs: echo "windows-runtimes=${runtimes_to_build}" >> $GITHUB_OUTPUT echo "windows-runtimes-check-targets=${runtimes_check_targets}" >> $GITHUB_OUTPUT - name: Build and Test + timeout-minutes: 180 if: ${{ steps.vars.outputs.windows-projects != '' }} shell: cmd run: | diff --git a/amd/comgr/src/comgr-cache-command.cpp b/amd/comgr/src/comgr-cache-command.cpp index 2eed16beb92db..1c92016ffe4ab 100644 --- a/amd/comgr/src/comgr-cache-command.cpp +++ b/amd/comgr/src/comgr-cache-command.cpp @@ -27,20 +27,6 @@ namespace COMGR { using namespace llvm; using namespace clang; -namespace { -// std::isalnum is locale dependent and can have issues -// depending on the stdlib version and application. 
We prefer to avoid it -bool isalnum(char c) { - char low[] = {'0', 'a', 'A'}; - char hi[] = {'9', 'z', 'Z'}; - for (unsigned i = 0; i != 3; ++i) { - if (low[i] <= c && c <= hi[i]) - return true; - } - return false; -} -} // namespace - std::optional CachedCommandAdaptor::searchComgrTmpModel(StringRef S) { // Ideally, we would use std::regex_search with the regex @@ -82,7 +68,9 @@ CachedCommandAdaptor::searchComgrTmpModel(StringRef S) { continue; } - if (!all_of(Remaining.substr(0, AlnumCount), isalnum)) { + // Use llvm::isAlnum and not std::isalnum. The latter is locale dependent and + // can have issues depending on the stdlib version and application. + if (!all_of(Remaining.substr(0, AlnumCount), llvm::isAlnum)) { continue; } diff --git a/amd/hipcc/bin/hipvars.pm b/amd/hipcc/bin/hipvars.pm deleted file mode 100644 index 672799d457fce..0000000000000 --- a/amd/hipcc/bin/hipvars.pm +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/perl perl -# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -package hipvars; -use warnings; -use Getopt::Long; -use Cwd; -use File::Basename; - -$HIP_BASE_VERSION_MAJOR = "6"; -$HIP_BASE_VERSION_MINOR = "2"; -$HIP_BASE_VERSION_PATCH = "0"; - -#--- -# Function to parse config file -sub parse_config_file { - my ($file, $config) = @_; - if (open (CONFIG, "$file")) { - while (<CONFIG>) { - my $config_line=$_; - chop ($config_line); - $config_line =~ s/^\s*//; - $config_line =~ s/\s*$//; - if (($config_line !~ /^#/) && ($config_line ne "")) { - my ($name, $value) = split (/=/, $config_line); - $$config{$name} = $value; - } - } - close(CONFIG); - } -} - -#--- -# Function to check if executable can be run -sub can_run { - my ($exe) = @_; - `$exe --version 2>&1`; - if ($? == 0) { - return 1; - } else { - return 0; - } -} - -$isWindows = ($^O eq 'MSWin32' or $^O eq 'msys'); -$doubleQuote = "\""; - -# -# TODO: Fix rpath LDFLAGS settings -# -# Since this hipcc script gets installed at two uneven hierarchical levels, -# linked by symlink, the absolute path of this script should be used to -# derive HIP_PATH, as dirname $0 could be /opt/rocm/bin or /opt/rocm/hip/bin -# depending on how it gets invoked. -# ROCM_PATH which points to is determined based on whether -# we find bin/rocm_agent_enumerator in the parent of HIP_PATH or not. If it is found, -# ROCM_PATH is defined relative to HIP_PATH else it is hardcoded to /opt/rocm.
-# -$HIP_PATH=$ENV{'HIP_PATH'} // dirname(Cwd::abs_path("$0/../")); # use parent directory of hipcc -if ($isWindows and defined $ENV{'HIP_PATH'}) { - $HIP_PATH =~ s/^"(.*)"$/$1/; - $HIP_PATH =~ s/\\/\//g; -} -if (-e "$HIP_PATH/bin/rocm_agent_enumerator") { - $ROCM_PATH=$ENV{'ROCM_PATH'} // "$HIP_PATH"; # use HIP_PATH -}elsif (-e "$HIP_PATH/../bin/rocm_agent_enumerator") { # case for backward compatibility - $ROCM_PATH=$ENV{'ROCM_PATH'} // dirname("$HIP_PATH"); # use parent directory of HIP_PATH -} else { - $ROCM_PATH=$ENV{'ROCM_PATH'} // "/opt/rocm"; -} -$CUDA_PATH=$ENV{'CUDA_PATH'} // '/usr/local/cuda'; -if ($isWindows and defined $ENV{'CUDA_PATH'}) { - $CUDA_PATH =~ s/^"(.*)"$/$1/; - $CUDA_PATH =~ s/\\/\//g; -} - -# Windows/Distro's have a different structure, all binaries are with hipcc -if ($isWindows or -e "$HIP_PATH/bin/clang") { - $HIP_CLANG_PATH=$ENV{'HIP_CLANG_PATH'} // "$HIP_PATH/bin"; -} else { - $HIP_CLANG_PATH=$ENV{'HIP_CLANG_PATH'} // "$ROCM_PATH/lib/llvm/bin"; -} -# HIP_ROCCLR_HOME is used by Windows builds -$HIP_ROCCLR_HOME=$ENV{'HIP_ROCCLR_HOME'}; - -if (defined $HIP_ROCCLR_HOME) { - $HIP_INFO_PATH= "$HIP_ROCCLR_HOME/lib/.hipInfo"; -} else { - $HIP_INFO_PATH= "$HIP_PATH/lib/.hipInfo"; # use actual file -} -#--- -#HIP_PLATFORM controls whether to use nvidia or amd platform: -$HIP_PLATFORM=$ENV{'HIP_PLATFORM'}; -# Read .hipInfo -my %hipInfo = (); -parse_config_file("$HIP_INFO_PATH", \%hipInfo); -# Prioritize Env first, otherwise use the hipInfo config file -$HIP_COMPILER = $ENV{'HIP_COMPILER'} // $hipInfo{'HIP_COMPILER'} // "clang"; -$HIP_RUNTIME = $ENV{'HIP_RUNTIME'} // $hipInfo{'HIP_RUNTIME'} // "rocclr"; - -# If using ROCclr runtime, need to find HIP_ROCCLR_HOME -if (defined $HIP_RUNTIME and $HIP_RUNTIME eq "rocclr" and !defined $HIP_ROCCLR_HOME) { - my $hipvars_dir = dirname(Cwd::abs_path($0)); - if (-e "$hipvars_dir/../lib/bitcode") { - $HIP_ROCCLR_HOME = Cwd::abs_path($hipvars_dir . "/.."); #FILE_REORG Backward compatibility - } elsif (-e "$hipvars_dir/lib/bitcode") { - $HIP_ROCCLR_HOME = Cwd::abs_path($hipvars_dir); - } else { - $HIP_ROCCLR_HOME = $HIP_PATH; # use HIP_PATH - } -} - -if (not defined $HIP_PLATFORM) { - if (can_run($doubleQuote . "$HIP_CLANG_PATH/clang++" . $doubleQuote) or can_run("amdclang++")) { - $HIP_PLATFORM = "amd"; - } elsif (can_run($doubleQuote . "$CUDA_PATH/bin/nvcc" . $doubleQuote) or can_run("nvcc")) { - $HIP_PLATFORM = "nvidia"; - $HIP_COMPILER = "nvcc"; - $HIP_RUNTIME = "cuda"; - } else { - # Default to amd for now - $HIP_PLATFORM = "amd"; - } -} elsif ($HIP_PLATFORM eq "hcc") { - $HIP_PLATFORM = "amd"; - warn("Warning: HIP_PLATFORM=hcc is deprecated. Please use HIP_PLATFORM=amd. \n") -} elsif ($HIP_PLATFORM eq "nvcc") { - $HIP_PLATFORM = "nvidia"; - $HIP_COMPILER = "nvcc"; - $HIP_RUNTIME = "cuda"; - warn("Warning: HIP_PLATFORM=nvcc is deprecated. Please use HIP_PLATFORM=nvidia. 
\n") -} - -if ($HIP_COMPILER eq "clang") { - # Windows does not have clang at linux default path - if (defined $HIP_ROCCLR_HOME and (-e "$HIP_ROCCLR_HOME/bin/clang" or -e "$HIP_ROCCLR_HOME/bin/clang.exe")) { - $HIP_CLANG_PATH = "$HIP_ROCCLR_HOME/bin"; - } -} - -#--- -# Read .hipVersion -my %hipVersion = (); -if ($isWindows) { - parse_config_file("$hipvars::HIP_PATH/bin/.hipVersion", \%hipVersion); -} else { - parse_config_file("$hipvars::HIP_PATH/share/hip/version", \%hipVersion); -} -$HIP_VERSION_MAJOR = $hipVersion{'HIP_VERSION_MAJOR'} // $HIP_BASE_VERSION_MAJOR; -$HIP_VERSION_MINOR = $hipVersion{'HIP_VERSION_MINOR'} // $HIP_BASE_VERSION_MINOR; -$HIP_VERSION_PATCH = $hipVersion{'HIP_VERSION_PATCH'} // $HIP_BASE_VERSION_PATCH; -$HIP_VERSION_GITHASH = $hipVersion{'HIP_VERSION_GITHASH'} // 0; -$HIP_VERSION="$HIP_VERSION_MAJOR.$HIP_VERSION_MINOR.$HIP_VERSION_PATCH-$HIP_VERSION_GITHASH"; diff --git a/bolt/README.md b/bolt/README.md index fe54bd82a356a..bf52f8c416915 100644 --- a/bolt/README.md +++ b/bolt/README.md @@ -108,9 +108,10 @@ $ perf record -e cycles:u -j any,u -o perf.data -- ... #### For Services Once you get the service deployed and warmed-up, it is time to collect perf -data with LBR (branch information). The exact perf command to use will depend -on the service. E.g., to collect the data for all processes running on the -server for the next 3 minutes use: +data with brstack (branch information). Different architectures implement this +using different hardware units, for example LBR on X86, and BRBE on AArch64. +The exact perf command to use will depend on the service. E.g., to collect the +data for all processes running on the server for the next 3 minutes use: ``` $ perf record -e cycles:u -j any,u -a -o perf.data -- sleep 180 ``` @@ -163,7 +164,7 @@ $ perf2bolt -p perf.data -o perf.fdata This command will aggregate branch data from `perf.data` and store it in a format that is both more compact and more resilient to binary modifications. -If the profile was collected without LBRs, you will need to add `-nl` flag to +If the profile was collected without brstacks, you will need to add `-nl` flag to the command line above. ### Step 3: Optimize with BOLT diff --git a/bolt/docs/BinaryAnalysis.md b/bolt/docs/BinaryAnalysis.md index b13410cd96355..07f096eef784e 100644 --- a/bolt/docs/BinaryAnalysis.md +++ b/bolt/docs/BinaryAnalysis.md @@ -1,7 +1,7 @@ # BOLT-based binary analysis As part of post-link-time optimizing, BOLT needs to perform a range of analyses -on binaries such as recontructing control flow graphs, and more. +on binaries such as reconstructing control flow graphs, and more. The `llvm-bolt-binary-analysis` tool enables running requested binary analyses on binaries, and generating reports. It does this by building on top of the diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 151399d69f213..43ceceee7de45 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -375,7 +375,7 @@ - `--use-old-text` - Re-use space in old .text if possible (relocation mode) + Reuse space in old .text if possible (relocation mode) - `-v ` diff --git a/bolt/docs/Heatmaps.md b/bolt/docs/Heatmaps.md index 6cf9c4da533b1..b10e72f0632e3 100644 --- a/bolt/docs/Heatmaps.md +++ b/bolt/docs/Heatmaps.md @@ -1,7 +1,7 @@ # Code Heatmaps BOLT has gained the ability to print code heatmaps based on -sampling-based profiles generated by `perf`, either with `LBR` data or not. 
+sampling-based profiles generated by `perf`, either with `brstack` data or not. The output is produced in colored ASCII to be displayed in a color-capable terminal. It looks something like this: @@ -20,9 +20,9 @@ or if you want to monitor the existing process(es): $ perf record -e cycles:u -j any,u [-p PID|-a] -- sleep ``` -Running with LBR (`-j any,u` or `-b`) is recommended. Heatmaps can be generated -from basic events by using the llvm-bolt-heatmap option `-nl` (no LBR) but -such heatmaps do not have the coverage provided by LBR and may only be useful +Running with brstack (`-j any,u` or `-b`) is recommended. Heatmaps can be generated +from basic events by using the llvm-bolt-heatmap option `-nl` (no brstack) but +such heatmaps do not have the coverage provided by brstack and may only be useful for finding event hotspots at larger code block granularities. Once the run is complete, and `perf.data` is generated, run llvm-bolt-heatmap: diff --git a/bolt/docs/OptimizingClang.md b/bolt/docs/OptimizingClang.md index 685fcc2b738fa..9992b4c735c22 100644 --- a/bolt/docs/OptimizingClang.md +++ b/bolt/docs/OptimizingClang.md @@ -97,7 +97,7 @@ BOLT-INFO: basic block reordering modified layout of 7848 (10.32%) functions 790053908 : all conditional branches (=) ... ``` -The statistics in the output is based on the LBR profile collected with `perf`, and since we were using +The statistics in the output is based on the brstack profile (LBR) collected with `perf`, and since we were using the `cycles` counter, its accuracy is affected. However, the relative improvement in `taken conditional branches` is a good indication that BOLT was able to straighten out the code even after PGO. diff --git a/bolt/docs/OptimizingLinux.md b/bolt/docs/OptimizingLinux.md index c85fecabcccc2..4712d5cdf7e85 100644 --- a/bolt/docs/OptimizingLinux.md +++ b/bolt/docs/OptimizingLinux.md @@ -5,7 +5,7 @@ Many Linux applications spend a significant amount of their execution time in the kernel. Thus, when we consider code optimization for system performance, it is essential to improve the CPU utilization not only in the user-space applications and libraries but also in the kernel. BOLT has demonstrated double-digit gains while being applied to user-space programs. This guide shows how to apply BOLT to the x86-64 Linux kernel and enhance your system's performance. In our experiments, BOLT boosted database TPS by 2 percent when applied to the kernel compiled with the highest level optimizations, including PGO and LTO. The database spent ~40% of the time in the kernel and was quite sensitive to kernel performance. -BOLT optimizes code layout based on a low-level execution profile collected with the Linux `perf` tool. The best quality profile should include branch history, such as Intel's last branch records (LBR). BOLT runs on a linked binary and reorders the code while combining frequently executed blocks of instructions in a manner best suited for the hardware. Other than branch instructions, most of the code is left unchanged. Additionally, BOLT updates all metadata associated with the modified code, including DWARF debug information and Linux ORC unwind information. +BOLT optimizes code layout based on a low-level execution profile collected with the Linux `perf` tool. The best quality profile should include branch history (brstack), such as Intel's last branch records (LBR) or AArch64's Branch Record Buffer Extension (BRBE). 
BOLT runs on a linked binary and reorders the code while combining frequently executed blocks of instructions in a manner best suited for the hardware. Other than branch instructions, most of the code is left unchanged. Additionally, BOLT updates all metadata associated with the modified code, including DWARF debug information and Linux ORC unwind information. While BOLT optimizations are not specific to the Linux kernel, certain quirks distinguish the kernel from user-level applications. diff --git a/bolt/docs/RuntimeLibrary.md b/bolt/docs/RuntimeLibrary.md index 58d9497a195b2..b969ebd3e3547 100644 --- a/bolt/docs/RuntimeLibrary.md +++ b/bolt/docs/RuntimeLibrary.md @@ -15,7 +15,7 @@ However, this approach quickly becomes awkward if we want to insert a lot of cod Currently, our runtime library is written in C++ and contains code that helps us instrument a binary. ### Limitations -Our library is not written with regular C++ code as it is not linked against any other libraries (this means we cannnot rely on anything defined on libstdc++, glibc, libgcc etc), but is self sufficient. In runtime/CMakeLists.txt, we can see it is built with -ffreestanding, which requires the compiler to avoid using a runtime library by itself. +Our library is not written with regular C++ code as it is not linked against any other libraries (this means we cannot rely on anything defined on libstdc++, glibc, libgcc etc), but is self sufficient. In runtime/CMakeLists.txt, we can see it is built with -ffreestanding, which requires the compiler to avoid using a runtime library by itself. While this requires us to make our own syscalls, it does simplify our linker a lot, which is very limited and can only do basic function name resolving. However, this is a big improvement in comparison with programmatically generating the code in assembly language using MCInsts. diff --git a/bolt/docs/doxygen.cfg.in b/bolt/docs/doxygen.cfg.in index 538285f47d924..de8b1f7bd6b3d 100644 --- a/bolt/docs/doxygen.cfg.in +++ b/bolt/docs/doxygen.cfg.in @@ -1070,7 +1070,7 @@ HTML_STYLESHEET = # defined cascading style sheet that is included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefor more robust against future updates. +# standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet file to the output directory. For an example # see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index f5e9887b56f70..7b10b2d28d7e3 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -1336,7 +1336,7 @@ class BinaryFunction { ColdCodeSectionName = Name.str(); } - /// Return true iif the function will halt execution on entry. + /// Return true if the function will halt execution on entry. bool trapsOnEntry() const { return TrapsOnEntry; } /// Make the function always trap on entry. 
Other than the trap instruction, diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h index 4c3c277adf422..95e958f16cffd 100644 --- a/bolt/include/bolt/Core/DIEBuilder.h +++ b/bolt/include/bolt/Core/DIEBuilder.h @@ -60,7 +60,7 @@ class DIEBuilder { uint32_t UnitLength = 0; bool IsConstructed = false; // A map of DIE offsets in original DWARF section to DIE ID. - // Whih is used to access DieInfoVector. + // Which is used to access DieInfoVector. std::unordered_map DIEIDMap; // Some STL implementations don't have a noexcept move constructor for diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h index 814978965ce3a..7c8ea12ee3ee3 100644 --- a/bolt/include/bolt/Core/DebugData.h +++ b/bolt/include/bolt/Core/DebugData.h @@ -326,8 +326,8 @@ class DebugAddrWriter { /// Write out entries in to .debug_addr section for CUs. virtual std::optional finalize(const size_t BufferSize); - /// Return buffer with all the entries in .debug_addr already writen out using - /// update(...). + /// Return buffer with all the entries in .debug_addr already written out + /// using update(...). virtual std::unique_ptr releaseBuffer() { return std::move(Buffer); } @@ -409,7 +409,7 @@ class DebugAddrWriter { std::mutex WriterMutex; std::unique_ptr Buffer; std::unique_ptr AddressStream; - /// Used to track sections that were not modified so that they can be re-used. + /// Used to track sections that were not modified so that they can be reused. static DenseMap UnmodifiedAddressOffsets; }; diff --git a/bolt/include/bolt/Core/DebugNames.h b/bolt/include/bolt/Core/DebugNames.h index cc4e13a481b2d..4ec49ca7207b5 100644 --- a/bolt/include/bolt/Core/DebugNames.h +++ b/bolt/include/bolt/Core/DebugNames.h @@ -65,7 +65,7 @@ class DWARF5AcceleratorTable { void setCurrentUnit(DWARFUnit &Unit, const uint64_t UnitStartOffset); /// Emit Accelerator table. void emitAccelTable(); - /// Returns true if the table was crated. + /// Returns true if the table was created. bool isCreated() const { return NeedToCreate; } /// Returns buffer containing the accelerator table. std::unique_ptr releaseBuffer() { @@ -91,7 +91,7 @@ class DWARF5AcceleratorTable { uint64_t CurrentUnitOffset = 0; const DWARFUnit *CurrentUnit = nullptr; std::unordered_map AbbrevTagToIndexMap; - /// Contains a map of TU hashes to a Foreign TU indecies. + /// Contains a map of TU hashes to Foreign TU indices. /// This is used to reduce the size of Foreign TU list since there could be /// multiple TUs with the same hash.
DenseMap TUHashToIndexMap; diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 2772de73081d1..d666c10885ad5 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -432,7 +432,7 @@ class MCPlusBuilder { return Analysis->isConditionalBranch(Inst); } - /// Returns true if Inst is a condtional move instruction + /// Returns true if Inst is a conditional move instruction virtual bool isConditionalMove(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; @@ -1564,7 +1564,7 @@ class MCPlusBuilder { } /// Get the default def_in and live_out registers for the function - /// Currently only used for the Stoke optimzation + /// Currently only used for the Stoke optimization virtual void getDefaultDefIn(BitVector &Regs) const { llvm_unreachable("not implemented"); } diff --git a/bolt/include/bolt/Passes/FrameAnalysis.h b/bolt/include/bolt/Passes/FrameAnalysis.h index d71c338bdcc37..5ce85be26cc48 100644 --- a/bolt/include/bolt/Passes/FrameAnalysis.h +++ b/bolt/include/bolt/Passes/FrameAnalysis.h @@ -37,7 +37,7 @@ struct FrameIndexEntry { int64_t StackOffset; uint8_t Size; - /// If this is false, we will never atempt to remove or optimize this + /// If this is false, we will never attempt to remove or optimize this /// instruction. We just use it to keep track of stores we don't fully /// understand but we know it may write to a frame position. bool IsSimple; diff --git a/bolt/include/bolt/Passes/LongJmp.h b/bolt/include/bolt/Passes/LongJmp.h index df3ea9620918a..84da4535648b2 100644 --- a/bolt/include/bolt/Passes/LongJmp.h +++ b/bolt/include/bolt/Passes/LongJmp.h @@ -30,7 +30,7 @@ namespace bolt { /// 64-bit range, we guarantee it can reach any code location. /// class LongJmpPass : public BinaryFunctionPass { - /// Used to implement stub grouping (re-using a stub from one function into + /// Used to implement stub grouping (reusing a stub from one function into /// another) using StubTy = std::pair; using StubGroupTy = SmallVector; diff --git a/bolt/include/bolt/Passes/PLTCall.h b/bolt/include/bolt/Passes/PLTCall.h index 09ef96e27293d..9c46f5ddf701c 100644 --- a/bolt/include/bolt/Passes/PLTCall.h +++ b/bolt/include/bolt/Passes/PLTCall.h @@ -26,7 +26,7 @@ class PLTCall : public BinaryFunctionPass { explicit PLTCall(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) {} - const char *getName() const override { return "PLT call optimization"; } + const char *getName() const override { return "plt-call-optimization"; } bool shouldPrint(const BinaryFunction &BF) const override { return BinaryFunctionPass::shouldPrint(BF); } diff --git a/bolt/include/bolt/Passes/ProfileQualityStats.h b/bolt/include/bolt/Passes/ProfileQualityStats.h index 86fc88cefc10e..ee74b12bd79bb 100644 --- a/bolt/include/bolt/Passes/ProfileQualityStats.h +++ b/bolt/include/bolt/Passes/ProfileQualityStats.h @@ -49,7 +49,7 @@ // aggregates the block gaps into 2 values for the function: "weighted" is the // weighted average of the block conservation gaps, where the weights depend on // each block's execution count and instruction count; "worst" is the worst -// (biggest) block gap acorss all basic blocks in the function with an execution +// (biggest) block gap across all basic blocks in the function with an execution // count of > 500. The pass then reports the 95th percentile of the weighted and // worst values of the 1000 functions in a single BOLT-INFO line. 
The smaller // the reported values are, the better the BOLT profile satisfies the function diff --git a/bolt/include/bolt/Passes/ReorderAlgorithm.h b/bolt/include/bolt/Passes/ReorderAlgorithm.h index 95d9e831ec68b..42bb33370cf7c 100644 --- a/bolt/include/bolt/Passes/ReorderAlgorithm.h +++ b/bolt/include/bolt/Passes/ReorderAlgorithm.h @@ -26,7 +26,7 @@ namespace bolt { /// Objects of this class implement various basic block clustering algorithms. /// Basic block clusters are chains of basic blocks that should be laid out -/// in this order to maximize performace. These algorithms group basic blocks +/// in this order to maximize performance. These algorithms group basic blocks /// into clusters using execution profile data and various heuristics. class ClusterAlgorithm { public: diff --git a/bolt/include/bolt/Passes/TailDuplication.h b/bolt/include/bolt/Passes/TailDuplication.h index a2fcab0720ca2..4a7ec083bc485 100644 --- a/bolt/include/bolt/Passes/TailDuplication.h +++ b/bolt/include/bolt/Passes/TailDuplication.h @@ -143,7 +143,7 @@ class TailDuplication : public BinaryFunctionPass { explicit TailDuplication() : BinaryFunctionPass(false) {} - const char *getName() const override { return "tail duplication"; } + const char *getName() const override { return "tail-duplication"; } Error runOnFunctions(BinaryContext &BC) override; }; diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h index 624245650a092..cab346b5aebc5 100644 --- a/bolt/include/bolt/Rewrite/DWARFRewriter.h +++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h @@ -128,7 +128,7 @@ class DWARFRewriter { CUOffsetMap finalizeTypeSections(DIEBuilder &DIEBlder, DIEStreamer &Streamer, GDBIndex &GDBIndexSection); - /// Process and write out CUs that are passsed in. + /// Process and write out CUs that are passed in. void finalizeCompileUnits(DIEBuilder &DIEBlder, DIEStreamer &Streamer, CUOffsetMap &CUMap, const std::list &CUs, diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 206d8eef40288..7dded4c59ed38 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -844,7 +844,7 @@ BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address, auto isSibling = std::bind(&BinaryContext::areRelatedFragments, this, &Function, std::placeholders::_1); assert(llvm::all_of(JT->Parents, isSibling) && - "cannot re-use jump table of a different function"); + "cannot reuse jump table of a different function"); (void)isSibling; if (opts::Verbosity > 2) { this->outs() << "BOLT-INFO: multiple fragments access the same jump table" @@ -860,7 +860,7 @@ BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address, return JT->getFirstLabel(); } - // Re-use the existing symbol if possible. + // Reuse the existing symbol if possible. 
MCSymbol *JTLabel = nullptr; if (BinaryData *Object = getBinaryDataAtAddress(Address)) { if (!isInternalSymbolName(Object->getSymbol()->getName())) @@ -1337,8 +1337,17 @@ void BinaryContext::processInterproceduralReferences() { << Function.getPrintName() << " and " << TargetFunction->getPrintName() << '\n'; } - if (uint64_t Offset = Address - TargetFunction->getAddress()) - TargetFunction->addEntryPointAtOffset(Offset); + if (uint64_t Offset = Address - TargetFunction->getAddress()) { + if (!TargetFunction->isInConstantIsland(Address)) { + TargetFunction->addEntryPointAtOffset(Offset); + } else { + TargetFunction->setIgnored(); + this->outs() << "BOLT-WARNING: Ignoring entry point at address 0x" + << Twine::utohexstr(Address) + << " in constant island of function " << *TargetFunction + << '\n'; + } + } continue; } diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 96878925eccad..4dfd4ba6f611b 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -3875,7 +3875,7 @@ uint64_t BinaryFunction::getEntryIDForSymbol(const MCSymbol *Symbol) const { if (FunctionSymbol == Symbol) return 0; - // Check all secondary entries available as either basic blocks or lables. + // Check all secondary entries available as either basic blocks or labels. uint64_t NumEntries = 1; for (const BinaryBasicBlock *BB : BasicBlocks) { MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(*BB); diff --git a/bolt/lib/Core/BinaryFunctionCallGraph.cpp b/bolt/lib/Core/BinaryFunctionCallGraph.cpp index f0c46a82fc74e..af2241998c93b 100644 --- a/bolt/lib/Core/BinaryFunctionCallGraph.cpp +++ b/bolt/lib/Core/BinaryFunctionCallGraph.cpp @@ -122,7 +122,7 @@ buildCallGraph(BinaryContext &BC, CgFilterFunction Filter, bool CgFromPerfData, // create a node for a function unless it was the target of a call from // a hot block. The alternative would be to set the count to one or // accumulate the number of calls from the callsite into the function - // samples. Results from perfomance testing seem to favor the zero + // samples. Results from performance testing seem to favor the zero // count though, so I'm leaving it this way for now. return Cg.addNode(Function, Size, Function->getKnownExecutionCount()); } diff --git a/bolt/lib/Core/CallGraph.cpp b/bolt/lib/Core/CallGraph.cpp index f1d52737bf556..f07add3a88112 100644 --- a/bolt/lib/Core/CallGraph.cpp +++ b/bolt/lib/Core/CallGraph.cpp @@ -22,7 +22,7 @@ #undef USE_SSECRC #endif -static LLVM_ATTRIBUTE_UNUSED inline size_t hash_int64_fallback(int64_t k) { +[[maybe_unused]] static inline size_t hash_int64_fallback(int64_t k) { uint64_t key = (unsigned long long)k; // "64 bit Mix Functions", from Thomas Wang's "Integer Hash Function." // http://www.concentric.net/~ttwang/tech/inthash.htm @@ -35,7 +35,7 @@ static LLVM_ATTRIBUTE_UNUSED inline size_t hash_int64_fallback(int64_t k) { return static_cast(static_cast(key)); } -static LLVM_ATTRIBUTE_UNUSED inline size_t hash_int64(int64_t k) { +[[maybe_unused]] static inline size_t hash_int64(int64_t k) { #if defined(USE_SSECRC) && defined(__SSE4_2__) size_t h = 0; __asm("crc32q %1, %0\n" : "+r"(h) : "rm"(k)); diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp index 7ce55f9165136..5b628f67c2b9b 100644 --- a/bolt/lib/Core/DIEBuilder.cpp +++ b/bolt/lib/Core/DIEBuilder.cpp @@ -137,7 +137,7 @@ void DIEBuilder::updateReferences() { DIEInteger(NewAddr)); } - // Handling referenes in location expressions. + // Handling references in location expressions. 
for (LocWithReference &LocExpr : getState().LocWithReferencesToProcess) { SmallVector Buffer; DataExtractor Data(StringRef((const char *)LocExpr.BlockData.data(), @@ -336,7 +336,7 @@ void DIEBuilder::buildCompileUnits(const bool Init) { registerUnit(*DU, false); } - // Using DULIst since it can be modified by cross CU refrence resolution. + // Using DUList since it can be modified by cross CU reference resolution. for (DWARFUnit *DU : getState().DUList) { if (DU->isTypeUnit()) continue; @@ -508,7 +508,7 @@ void DIEBuilder::finish() { UnitStartOffset += CurUnitInfo.UnitLength; }; // Computing offsets for .debug_types section. - // It's processed first when CU is registered so will be at the begginnig of + // It's processed first when CU is registered so will be at the beginning of // the vector. uint64_t TypeUnitStartOffset = 0; for (DWARFUnit *CU : getState().DUList) { diff --git a/bolt/lib/Core/DebugData.cpp b/bolt/lib/Core/DebugData.cpp index e05f28f08572c..2687788167a8a 100644 --- a/bolt/lib/Core/DebugData.cpp +++ b/bolt/lib/Core/DebugData.cpp @@ -101,7 +101,7 @@ std::optional findAttributeInfo(const DWARFDie DIE, return findAttributeInfo(DIE, AbbrevDecl, *Index); } -LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] static void printLE64(const std::string &S) { for (uint32_t I = 0, Size = S.size(); I < Size; ++I) { errs() << Twine::utohexstr(S[I]); @@ -876,7 +876,7 @@ void DebugStrOffsetsWriter::finalizeSection(DWARFUnit &Unit, DIEValue StrListBaseAttrInfo = Die.findAttribute(dwarf::DW_AT_str_offsets_base); auto RetVal = ProcessedBaseOffsets.find(*Val); - // Handling re-use of str-offsets section. + // Handling reuse of str-offsets section. if (RetVal == ProcessedBaseOffsets.end() || StrOffsetSectionWasModified) { initialize(Unit); // Update String Offsets that were modified. @@ -1167,7 +1167,7 @@ void DwarfLineTable::emitCU(MCStreamer *MCOS, MCDwarfLineTableParams Params, // For functions that we do not modify we output them as raw data. // Re-constructing .debug_line_str so that offsets are correct for those // debug line tables. -// Bonus is that when we output a final binary we can re-use .debug_line_str +// Bonus is that when we output a final binary we can reuse .debug_line_str // section. So we don't have to do the SHF_ALLOC trick we did with // .debug_line. static void parseAndPopulateDebugLineStr(BinarySection &LineStrSection, diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp index a9d98a6ba879b..6be2c5aa4e6c1 100644 --- a/bolt/lib/Core/DebugNames.cpp +++ b/bolt/lib/Core/DebugNames.cpp @@ -55,7 +55,7 @@ DWARF5AcceleratorTable::DWARF5AcceleratorTable( llvm::hash_value(llvm::StringRef(CStr)), StrOffset); if (!R.second) BC.errs() - << "BOLT-WARNING: [internal-dwarf-error]: collision occured on " + << "BOLT-WARNING: [internal-dwarf-error]: collision occurred on " << CStr << " at offset : 0x" << Twine::utohexstr(StrOffset) << ". Previous string offset is: 0x" << Twine::utohexstr(R.first->second) << ".\n"; @@ -86,7 +86,7 @@ void DWARF5AcceleratorTable::addUnit(DWARFUnit &Unit, if (Unit.isTypeUnit()) { if (DWOID) { // We adding an entry for a DWO TU. The DWO CU might not have any entries, - // so need to add it to the list pre-emptively. + // so need to add it to the list preemptively.
auto Iter = CUOffsetsToPatch.insert({*DWOID, CUList.size()}); if (Iter.second) CUList.push_back(BADCUOFFSET); diff --git a/bolt/lib/Passes/Aligner.cpp b/bolt/lib/Passes/Aligner.cpp index c3ddedaaa1466..5d21bdb3f154a 100644 --- a/bolt/lib/Passes/Aligner.cpp +++ b/bolt/lib/Passes/Aligner.cpp @@ -60,7 +60,7 @@ namespace llvm { namespace bolt { // Align function to the specified byte-boundary (typically, 64) offsetting -// the fuction by not more than the corresponding value +// the function by not more than the corresponding value static void alignMaxBytes(BinaryFunction &Function) { Function.setAlignment(opts::AlignFunctions); Function.setMaxAlignmentBytes(opts::AlignFunctionsMaxBytes); @@ -68,7 +68,7 @@ static void alignMaxBytes(BinaryFunction &Function) { } // Align function to the specified byte-boundary (typically, 64) offsetting -// the fuction by not more than the minimum over +// the function by not more than the minimum over // -- the size of the function // -- the specified number of bytes static void alignCompact(BinaryFunction &Function, diff --git a/bolt/lib/Passes/FrameAnalysis.cpp b/bolt/lib/Passes/FrameAnalysis.cpp index f568039bbf163..0b26da3371234 100644 --- a/bolt/lib/Passes/FrameAnalysis.cpp +++ b/bolt/lib/Passes/FrameAnalysis.cpp @@ -198,7 +198,7 @@ class FrameAccessAnalysis { if (CFIStack.empty()) dbgs() << "Assertion is about to fail: " << BF.getPrintName() << "\n"; assert(!CFIStack.empty() && "Corrupt CFI stack"); - std::pair &Elem = CFIStack.top(); + std::pair Elem = CFIStack.top(); CFIStack.pop(); CfaOffset = Elem.first; CfaReg = Elem.second; diff --git a/bolt/lib/Passes/RegReAssign.cpp b/bolt/lib/Passes/RegReAssign.cpp index 60349f18b11d3..0859cd244ce40 100644 --- a/bolt/lib/Passes/RegReAssign.cpp +++ b/bolt/lib/Passes/RegReAssign.cpp @@ -145,7 +145,7 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { const bool CannotUseREX = BC.MIB->cannotUseREX(Inst); const MCInstrDesc &Desc = BC.MII->get(Inst.getOpcode()); - // Disallow substituitions involving regs in implicit uses lists + // Disallow substitutions involving regs in implicit uses lists for (MCPhysReg ImplicitUse : Desc.implicit_uses()) { const size_t RegEC = BC.MIB->getAliases(ImplicitUse, false).find_first(); @@ -153,7 +153,7 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { std::numeric_limits::min(); } - // Disallow substituitions involving regs in implicit defs lists + // Disallow substitutions involving regs in implicit defs lists for (MCPhysReg ImplicitDef : Desc.implicit_defs()) { const size_t RegEC = BC.MIB->getAliases(ImplicitDef, false).find_first(); @@ -174,7 +174,7 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { if (RegEC == 0) continue; - // Disallow substituitions involving regs in instrs that cannot use REX + // Disallow substitutions involving regs in instrs that cannot use REX // The relationship of X86 registers is shown in the diagram. BL and BH // do not have a direct alias relationship. 
However, if the BH register // cannot be swapped, then the BX/EBX/RBX registers cannot be swapped as diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp index 4ea60f388e2fa..fe342ccd38a67 100644 --- a/bolt/lib/Passes/ShrinkWrapping.cpp +++ b/bolt/lib/Passes/ShrinkWrapping.cpp @@ -402,7 +402,7 @@ void StackLayoutModifier::classifyCFIs() { break; case MCCFIInstruction::OpRestoreState: { assert(!CFIStack.empty() && "Corrupt CFI stack"); - std::pair &Elem = CFIStack.top(); + std::pair Elem = CFIStack.top(); CFIStack.pop(); CfaOffset = Elem.first; CfaReg = Elem.second; diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp index eab669b32b71e..66a373ad2de72 100644 --- a/bolt/lib/Passes/SplitFunctions.cpp +++ b/bolt/lib/Passes/SplitFunctions.cpp @@ -386,7 +386,7 @@ struct SplitCacheDirected final : public SplitStrategy { } /// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex. - /// Increament Score.LocalScore in place by the sum. + /// Increment Score.LocalScore in place by the sum. void computeJumpScore(const BasicBlockOrder &BlockOrder, const size_t SplitIndex, SplitScore &Score) { @@ -413,7 +413,7 @@ struct SplitCacheDirected final : public SplitStrategy { } /// Compute sum of scores over calls originated in the current function - /// given \p SplitIndex. Increament Score.LocalScore in place by the sum. + /// given \p SplitIndex. Increment Score.LocalScore in place by the sum. void computeLocalCallScore(const BasicBlockOrder &BlockOrder, const size_t SplitIndex, SplitScore &Score) { if (opts::CallScale == 0) @@ -455,7 +455,7 @@ struct SplitCacheDirected final : public SplitStrategy { } /// Compute sum of splitting scores for cover calls of the input function. - /// Increament Score.CoverCallScore in place by the sum. + /// Increment Score.CoverCallScore in place by the sum. void computeCoverCallScore(const BasicBlockOrder &BlockOrder, const size_t SplitIndex, const std::vector &CoverCalls, @@ -467,7 +467,7 @@ struct SplitCacheDirected final : public SplitStrategy { assert(CI.Length >= Score.HotSizeReduction && "Length of cover calls must exceed reduced size of hot fragment."); // Compute the new length of the call, which is shorter than the original - // one by the size of the splitted fragment minus the total size increase. + // one by the size of the split fragment minus the total size increase. const size_t NewCallLength = CI.Length - Score.HotSizeReduction; Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength); } @@ -502,12 +502,12 @@ struct SplitCacheDirected final : public SplitStrategy { // First part of LocalScore is the sum over call edges originated in the // input function. These edges can get shorter or longer depending on - // SplitIndex. Score.LocalScore is increamented in place. + // SplitIndex. Score.LocalScore is incremented in place. computeLocalCallScore(BlockOrder, SplitIndex, Score); // Second part of LocalScore is the sum over jump edges with src basic block // and dst basic block in the current function. Score.LocalScore is - // increamented in place. + // incremented in place. computeJumpScore(BlockOrder, SplitIndex, Score); // Compute CoverCallScore and store in Score in place. 
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 3604fdd3a94b4..9faccc23b4b81 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -46,16 +46,15 @@ namespace opts { static cl::opt BasicAggregation("nl", - cl::desc("aggregate basic samples (without LBR info)"), + cl::desc("aggregate basic samples (without brstack info)"), cl::cat(AggregatorCategory)); cl::opt ArmSPE("spe", cl::desc("Enable Arm SPE mode."), cl::cat(AggregatorCategory)); -static cl::opt - ITraceAggregation("itrace", - cl::desc("Generate LBR info with perf itrace argument"), - cl::cat(AggregatorCategory)); +static cl::opt ITraceAggregation( + "itrace", cl::desc("Generate brstack info with perf itrace argument"), + cl::cat(AggregatorCategory)); static cl::opt FilterMemProfile("filter-mem-profile", @@ -201,7 +200,7 @@ void DataAggregator::start() { } if (opts::BasicAggregation) { - launchPerfProcess("events without LBR", MainEventsPPI, + launchPerfProcess("events without brstack", MainEventsPPI, "script -F pid,event,ip"); } else if (!opts::ITraceAggregation.empty()) { // Disable parsing memory profile from trace data, unless requested by user. @@ -1069,7 +1068,7 @@ ErrorOr DataAggregator::parseLBREntry() { if (std::error_code EC = Rest.getError()) return EC; if (Rest.get().size() < 5) { - reportError("expected rest of LBR entry"); + reportError("expected rest of brstack entry"); Diag << "Found: " << Rest.get() << "\n"; return make_error_code(llvm::errc::io_error); } @@ -1433,7 +1432,7 @@ std::error_code DataAggregator::printLBRHeatMap() { errs() << "HEATMAP-ERROR: no basic event samples detected in profile. " "Cannot build heatmap."; } else { - errs() << "HEATMAP-ERROR: no LBR traces detected in profile. " + errs() << "HEATMAP-ERROR: no brstack traces detected in profile. " "Cannot build heatmap. Use -nl for building heatmap from " "basic events.\n"; } @@ -1572,7 +1571,7 @@ void DataAggregator::printBranchStacksDiagnostics( std::error_code DataAggregator::parseBranchEvents() { std::string BranchEventTypeStr = - opts::ArmSPE ? "SPE branch events in LBR-format" : "branch events"; + opts::ArmSPE ? "SPE branch events in brstack-format" : "branch events"; outs() << "PERF2BOLT: parse " << BranchEventTypeStr << "...\n"; NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); @@ -1620,7 +1619,7 @@ std::error_code DataAggregator::parseBranchEvents() { clear(TraceMap); outs() << "PERF2BOLT: read " << NumSamples << " samples and " << NumEntries - << " LBR entries\n"; + << " brstack entries\n"; if (NumTotalSamples) { if (NumSamples && NumSamplesNoLBR == NumSamples) { // Note: we don't know if perf2bolt is being used to parse memory samples @@ -1628,8 +1627,10 @@ std::error_code DataAggregator::parseBranchEvents() { if (!opts::ArmSPE) errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack " - "LBR. Record profile with perf record -j any or run perf2bolt " - "in no-LBR mode with -nl (the performance improvement in -nl " + "brstack. 
Record profile with perf record -j any or run " + "perf2bolt " + "in non-brstack mode with -nl (the performance improvement in " + "-nl " "mode may be limited)\n"; else errs() @@ -1664,7 +1665,7 @@ void DataAggregator::processBranchEvents() { } std::error_code DataAggregator::parseBasicEvents() { - outs() << "PERF2BOLT: parsing basic events (without LBR)...\n"; + outs() << "PERF2BOLT: parsing basic events (without brstack)...\n"; NamedRegionTimer T("parseBasic", "Parsing basic events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); while (hasData()) { @@ -1688,7 +1689,7 @@ std::error_code DataAggregator::parseBasicEvents() { } void DataAggregator::processBasicEvents() { - outs() << "PERF2BOLT: processing basic events (without LBR)...\n"; + outs() << "PERF2BOLT: processing basic events (without brstack)...\n"; NamedRegionTimer T("processBasic", "Processing basic events", TimerGroupName, TimerGroupDesc, opts::TimeAggregator); uint64_t OutOfRangeSamples = 0; @@ -1777,7 +1778,8 @@ std::error_code DataAggregator::parsePreAggregatedLBRSamples() { ++AggregatedLBRs; } - outs() << "PERF2BOLT: read " << AggregatedLBRs << " aggregated LBR entries\n"; + outs() << "PERF2BOLT: read " << AggregatedLBRs + << " aggregated brstack entries\n"; return std::error_code(); } @@ -2426,7 +2428,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, void DataAggregator::dump() const { DataReader::dump(); } void DataAggregator::dump(const PerfBranchSample &Sample) const { - Diag << "Sample LBR entries: " << Sample.LBR.size() << "\n"; + Diag << "Sample brstack entries: " << Sample.LBR.size() << "\n"; for (const LBREntry &LBR : Sample.LBR) Diag << LBR << '\n'; } diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp index afe24216d7f5d..d7e2d0fe4000a 100644 --- a/bolt/lib/Profile/DataReader.cpp +++ b/bolt/lib/Profile/DataReader.cpp @@ -570,7 +570,7 @@ void DataReader::readBasicSampleData(BinaryFunction &BF) { if (!SampleDataOrErr) return; - // Basic samples mode territory (without LBR info) + // Basic samples mode territory (without brstack info) // First step is to assign BB execution count based on samples from perf BF.ProfileMatchRatio = 1.0f; BF.removeTagsFromProfile(); @@ -578,8 +578,8 @@ void DataReader::readBasicSampleData(BinaryFunction &BF) { bool NormalizeByCalls = usesEvent("branches"); static bool NagUser = true; if (NagUser) { - outs() - << "BOLT-INFO: operating with basic samples profiling data (no LBR).\n"; + outs() << "BOLT-INFO: operating with basic samples profiling data (no " + "brstack).\n"; if (NormalizeByInsnCount) outs() << "BOLT-INFO: normalizing samples by instruction count.\n"; else if (NormalizeByCalls) @@ -907,7 +907,7 @@ ErrorOr DataReader::parseHexField(char EndChar, bool EndNl) { StringRef NumStr = NumStrRes.get(); uint64_t Num; if (NumStr.getAsInteger(16, Num)) { - reportError("expected hexidecimal number"); + reportError("expected hexadecimal number"); Diag << "Found: " << NumStr << "\n"; return make_error_code(llvm::errc::io_error); } diff --git a/bolt/lib/Rewrite/BuildIDRewriter.cpp b/bolt/lib/Rewrite/BuildIDRewriter.cpp index d50416fb80c6c..706a3d0d92d56 100644 --- a/bolt/lib/Rewrite/BuildIDRewriter.cpp +++ b/bolt/lib/Rewrite/BuildIDRewriter.cpp @@ -48,7 +48,7 @@ class BuildIDRewriter final : public MetadataRewriter { }; Error BuildIDRewriter::sectionInitializer() { - // Typically, build ID will reside in .note.gnu.build-id section. Howerver, + // Typically, build ID will reside in .note.gnu.build-id section. 
However, // a linker script can change the section name and such is the case with // the Linux kernel. Hence, we iterate over all note sections. for (BinarySection &NoteSection : BC.sections()) { diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 7366d2aca35ea..5e3fa931e826f 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -69,7 +69,7 @@ static void printDie(const DWARFDie &DIE) { } /// Lazily parse DWARF DIE and print it out. -LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] static void printDie(DWARFUnit &DU, uint64_t DIEOffset) { uint64_t OriginalOffsets = DIEOffset; uint64_t NextCUOffset = DU.getNextUnitOffset(); @@ -1723,7 +1723,7 @@ StringRef getSectionName(const SectionRef &Section) { return Name; } -// Exctracts an appropriate slice if input is DWP. +// Extracts an appropriate slice if input is DWP. // Applies patches or overwrites the section. std::optional updateDebugData( DWARFContext &DWCtx, StringRef SectionName, StringRef SectionContents, @@ -1759,7 +1759,7 @@ std::optional updateDebugData( auto Iter = OverridenSections.find(Kind); if (Iter == OverridenSections.end()) { errs() - << "BOLT-WARNING: [internal-dwarf-error]: Could not find overriden " + << "BOLT-WARNING: [internal-dwarf-error]: Could not find overridden " "section for: " << Twine::utohexstr(DWOId) << ".\n"; return std::nullopt; @@ -1991,7 +1991,7 @@ void DWARFRewriter::convertToRangesPatchDebugInfo( } } - // HighPC was conveted into DW_AT_ranges. + // HighPC was converted into DW_AT_ranges. // For DWARF5 we only access ranges through index. DIEBldr.replaceValue(&Die, HighPCAttrInfo.getAttribute(), dwarf::DW_AT_ranges, diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index ee021fee3cea5..947d8992890d4 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -308,7 +308,7 @@ void PseudoProbeRewriter::encodePseudoProbes() { Contents.append(OSE.str().begin(), OSE.str().end()); }; - // Emit indiviual pseudo probes in a inline tree node + // Emit individual pseudo probes in a inline tree node // Probe index, type, attribute, address type and address are encoded // Address of the first probe is absolute. // Other probes' address are represented by delta diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index a0e79957edc01..6fa66ab55210d 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -1087,7 +1087,7 @@ void RewriteInstance::discoverFileObjects() { if (SymbolAddress == Section->getAddress() + Section->getSize()) { assert(SymbolSize == 0 && - "unexpect non-zero sized symbol at end of section"); + "unexpected non-zero sized symbol at end of section"); LLVM_DEBUG( dbgs() << "BOLT-DEBUG: rejecting as symbol points to end of its section\n"); @@ -2440,7 +2440,7 @@ void RewriteInstance::processDynamicRelocations() { } // The rest of dynamic relocations - DT_RELA. 
- // The static executable might have .rela.dyn secion and not have PT_DYNAMIC + // The static executable might have .rela.dyn section and not have PT_DYNAMIC if (!DynamicRelocationsSize && BC->IsStaticExecutable) { ErrorOr DynamicRelSectionOrErr = BC->getUniqueSectionByName(getRelaDynSectionName()); @@ -2665,8 +2665,9 @@ void RewriteInstance::readRelocations(const SectionRef &Section) { return; } const bool SkipRelocs = StringSwitch(RelocatedSectionName) - .Cases(".plt", ".rela.plt", ".got.plt", - ".eh_frame", ".gcc_except_table", true) + .Cases({".plt", ".rela.plt", ".got.plt", + ".eh_frame", ".gcc_except_table"}, + true) .Default(false); if (SkipRelocs) { LLVM_DEBUG( @@ -5017,7 +5018,7 @@ void RewriteInstance::updateELFSymbolTable( if (!Section) return false; - // Remove the section symbol iif the corresponding section was stripped. + // Remove the section symbol if the corresponding section was stripped. if (Symbol.getType() == ELF::STT_SECTION) { if (!getNewSectionIndex(Symbol.st_shndx)) return true; diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index df4f42128605e..6954cb295e86a 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -1296,7 +1296,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { AArch64_AM::ShiftExtendType ExtendType = AArch64_AM::getArithExtendType(OperandExtension); if (ShiftVal != 2) { - // TODO: Handle the patten where ShiftVal != 2. + // TODO: Handle the pattern where ShiftVal != 2. // The following code sequence below has no shift amount, // the range could be 0 to 4. // The pattern comes from libc, it occurs when the binary is static. @@ -1626,7 +1626,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { int getUncondBranchEncodingSize() const override { return 28; } // This helper function creates the snippet of code that compares a register - // RegNo with an immedaite Imm, and jumps to Target if they are equal. + // RegNo with an immediate Imm, and jumps to Target if they are equal. // cmp RegNo, #Imm // b.eq Target // where cmp is an alias for subs, which results in the code below: @@ -1648,7 +1648,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { } // This helper function creates the snippet of code that compares a register - // RegNo with an immedaite Imm, and jumps to Target if they are not equal. + // RegNo with an immediate Imm, and jumps to Target if they are not equal. // cmp RegNo, #Imm // b.ne Target // where cmp is an alias for subs, which results in the code below: diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 9026a9df7b5c2..5fca5e813515f 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -2715,7 +2715,7 @@ class X86MCPlusBuilder : public MCPlusBuilder { bool FoundOne = false; - // Iterate only through src operands that arent also dest operands + // Iterate only through src operands that aren't also dest operands for (unsigned Index = InstDesc.getNumDefs() + (HasLHS ? 
1 : 0),
                E = InstDesc.getNumOperands();
       Index != E; ++Index) {
diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp
index 095612ac3a4ac..5be04d2ceea94 100644
--- a/bolt/lib/Utils/CommandLineOpts.cpp
+++ b/bolt/lib/Utils/CommandLineOpts.cpp
@@ -285,7 +285,7 @@ cl::opt<bool> TimeRewrite("time-rewrite",
 
 cl::opt<bool> UseOldText(
     "use-old-text",
-    cl::desc("re-use space in old .text if possible (relocation mode)"),
+    cl::desc("reuse space in old .text if possible (relocation mode)"),
     cl::cat(BoltCategory));
 
 cl::opt<bool> UpdateDebugSections(
diff --git a/bolt/runtime/hugify.cpp b/bolt/runtime/hugify.cpp
index 672b04247dfa4..de896307f24fc 100644
--- a/bolt/runtime/hugify.cpp
+++ b/bolt/runtime/hugify.cpp
@@ -24,7 +24,7 @@
 {}
 #endif
 
-// Function constains trampoline to _start,
+// Function contains trampoline to _start,
 // so we can resume regular execution of the function that we hooked.
 extern void __bolt_hugify_start_program();
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
index 1f54a500dbf98..f586db2b0f9ba 100644
--- a/bolt/runtime/instr.cpp
+++ b/bolt/runtime/instr.cpp
@@ -214,7 +214,7 @@ class BumpPtrAllocator {
 /// __bolt_instr_setup, our initialization routine.
 BumpPtrAllocator *GlobalAlloc;
 
-// Base address which we substract from recorded PC values when searching for
+// Base address which we subtract from recorded PC values when searching for
 // indirect call description entries. Needed because indCall descriptions are
 // mapped read-only and contain static addresses. Initialized in
 // __bolt_instr_setup.
@@ -261,7 +261,7 @@ struct SimpleHashTableEntryBase {
   // Currently we have to do it the ugly way because
   // we want every message to be printed atomically via a single call to
   // __write. If we use reportNumber() and others nultiple times, we'll get
-  // garbage in mulithreaded environment
+  // garbage in multithreaded environment
   char Buf[BufSize];
   char *Ptr = Buf;
   Ptr = intToStr(Ptr, __getpid(), 10);
@@ -1585,7 +1585,7 @@ __bolt_instr_data_dump(int FD, const char *LibPath = nullptr,
 /// at user-specified intervals
 void watchProcess() {
   timespec ts, rem;
-  uint64_t Ellapsed = 0ull;
+  uint64_t Elapsed = 0ull;
   int FD = openProfile();
   uint64_t ppid;
   if (__bolt_instr_wait_forks) {
@@ -1615,10 +1615,10 @@ void watchProcess() {
       break;
     }
 
-    if (++Ellapsed < __bolt_instr_sleep_time)
-      continue;
-
-    Ellapsed = 0;
+    if (++Elapsed < __bolt_instr_sleep_time)
+      continue;
+
+    Elapsed = 0;
     __bolt_instr_data_dump(FD);
     if (__bolt_instr_no_counters_clear == false)
       __bolt_instr_clear_counters();
diff --git a/bolt/runtime/sys_aarch64.h b/bolt/runtime/sys_aarch64.h
index 77c9cfcc99f98..b1d04f9d558e0 100644
--- a/bolt/runtime/sys_aarch64.h
+++ b/bolt/runtime/sys_aarch64.h
@@ -41,7 +41,7 @@
 // Anonymous namespace covering everything but our library entry point
 namespace {
 
-// Get the difference between runtime addrress of .text section and
+// Get the difference between runtime address of .text section and
 // static address in section header table. Can be extracted from arbitrary
 // pc value recorded at runtime to get the corresponding static address, which
 // in turn can be used to search for indirect call description.
Needed because diff --git a/bolt/runtime/sys_riscv64.h b/bolt/runtime/sys_riscv64.h index 00a21e4945f0f..442fa2e018494 100644 --- a/bolt/runtime/sys_riscv64.h +++ b/bolt/runtime/sys_riscv64.h @@ -105,7 +105,7 @@ // Anonymous namespace covering everything but our library entry point namespace { -// Get the difference between runtime addrress of .text section and +// Get the difference between runtime address of .text section and // static address in section header table. Can be extracted from arbitrary // pc value recorded at runtime to get the corresponding static address, which // in turn can be used to search for indirect call description. Needed because diff --git a/bolt/runtime/sys_x86_64.h b/bolt/runtime/sys_x86_64.h index ca2c69326a14f..933e939012481 100644 --- a/bolt/runtime/sys_x86_64.h +++ b/bolt/runtime/sys_x86_64.h @@ -40,7 +40,7 @@ namespace { -// Get the difference between runtime addrress of .text section and +// Get the difference between runtime address of .text section and // static address in section header table. Can be extracted from arbitrary // pc value recorded at runtime to get the corresponding static address, which // in turn can be used to search for indirect call description. Needed because @@ -171,8 +171,9 @@ uint64_t __exit(uint64_t code) { #if !defined(__APPLE__) // We use a stack-allocated buffer for string manipulation in many pieces of // this code, including the code that prints each line of the fdata file. This -// buffer needs to accomodate large function names, but shouldn't be arbitrarily -// large (dynamically allocated) for simplicity of our memory space usage. +// buffer needs to accommodate large function names, but shouldn't be +// arbitrarily large (dynamically allocated) for simplicity of our memory space +// usage. // Declare some syscall wrappers we use throughout this code to avoid linking // against system libc. diff --git a/bolt/test/AArch64/constant-island-alignment.s b/bolt/test/AArch64/constant-island-alignment.s index 957c4705f5eec..99fe7333e2dba 100644 --- a/bolt/test/AArch64/constant-island-alignment.s +++ b/bolt/test/AArch64/constant-island-alignment.s @@ -3,7 +3,7 @@ # RUN: split-file %s %t // For the first test case, in case the nop before .Lci will be removed -// the pointer to exit function won't be alinged and the test will fail. +// the pointer to exit function won't be aligned and the test will fail. # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ # RUN: %t/xword_align.s -o %t_xa.o diff --git a/bolt/test/AArch64/constant-island-entry.s b/bolt/test/AArch64/constant-island-entry.s new file mode 100644 index 0000000000000..6567114eb980a --- /dev/null +++ b/bolt/test/AArch64/constant-island-entry.s @@ -0,0 +1,27 @@ +// This test checks that we ignore functions which add an entry point that +// is in a constant island. 
+ +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags %t.o -pie -Wl,-q -o %t.exe +# RUN: llvm-bolt %t.exe -o %t.bolt 2>&1 | FileCheck %s + +# CHECK: BOLT-WARNING: Ignoring entry point at address 0x{{[0-9a-f]+}} in constant island of function func + +.globl func +.type func, %function +func: + b .Lafter_constant + +.type constant_island, %object +constant_island: + .xword 0xabcdef + +.Lafter_constant: + ret + .size func, .-func + +.globl caller +.type caller, %function +caller: + bl constant_island + ret diff --git a/bolt/test/AArch64/ifunc.test b/bolt/test/AArch64/ifunc.test index 3da42c67c5a0a..15ecc3503f22f 100644 --- a/bolt/test/AArch64/ifunc.test +++ b/bolt/test/AArch64/ifunc.test @@ -9,7 +9,7 @@ // RUN: FileCheck --check-prefix=REL_CHECK %s // Non-pie static executable doesn't generate PT_DYNAMIC, check relocation -// is readed successfully and IPLT trampoline has been identified by bolt. +// is read successfully and IPLT trampoline has been identified by bolt. // RUN: %clang %cflags -nostdlib -O3 %p/../Inputs/ifunc.c -fuse-ld=lld -no-pie \ // RUN: -o %t.O3_nopie.exe -Wl,-q // RUN: llvm-readelf -l %t.O3_nopie.exe | \ diff --git a/bolt/test/X86/bolt-address-translation-yaml.test b/bolt/test/X86/bolt-address-translation-yaml.test index cffe848a16ae1..fb16708f13a4a 100644 --- a/bolt/test/X86/bolt-address-translation-yaml.test +++ b/bolt/test/X86/bolt-address-translation-yaml.test @@ -46,7 +46,7 @@ WRITE-BAT-CHECK: BOLT-INFO: BAT section size (bytes): 404 READ-BAT-CHECK-NOT: BOLT-ERROR: unable to save profile in YAML format for input file processed by BOLT READ-BAT-CHECK: BOLT-INFO: Parsed 5 BAT entries -READ-BAT-CHECK: PERF2BOLT: read 79 aggregated LBR entries +READ-BAT-CHECK: PERF2BOLT: read 79 aggregated brstack entries READ-BAT-CHECK: HEATMAP: building heat map READ-BAT-CHECK: BOLT-INFO: 5 out of 21 functions in the binary (23.8%) have non-empty execution profile READ-BAT-FDATA-CHECK: BOLT-INFO: 5 out of 16 functions in the binary (31.2%) have non-empty execution profile diff --git a/bolt/test/X86/cdsplit-call-scale.s b/bolt/test/X86/cdsplit-call-scale.s index 66f30036de8c1..caa11b6feb6c4 100644 --- a/bolt/test/X86/cdsplit-call-scale.s +++ b/bolt/test/X86/cdsplit-call-scale.s @@ -1,8 +1,8 @@ ## Test the control of aggressiveness of 3-way splitting by -call-scale. -## When -call-scale=0.0, the tested function is 2-way splitted. -## When -call-scale=1.0, the tested function is 3-way splitted with 5 blocks +## When -call-scale=0.0, the tested function is 2-way split. +## When -call-scale=1.0, the tested function is 3-way split with 5 blocks ## in warm because of the increased benefit of shortening the call edges. -## When -call-scale=1000.0, the tested function is still 3-way splitted with +## When -call-scale=1000.0, the tested function is still 3-way split with ## 5 blocks in warm because cdsplit does not allow hot-warm splitting to break ## a fall through branch from a basic block to its most likely successor. diff --git a/bolt/test/X86/dwarf5-two-cu-str-offset-table.test b/bolt/test/X86/dwarf5-two-cu-str-offset-table.test index e59664e3281a1..488635b582d0d 100644 --- a/bolt/test/X86/dwarf5-two-cu-str-offset-table.test +++ b/bolt/test/X86/dwarf5-two-cu-str-offset-table.test @@ -8,7 +8,7 @@ # RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets %t.bolt >> %t.txt # RUN: cat %t.txt | FileCheck --check-prefix=CHECK %s -## This test checks we correclty re-renerate .debug_str_offsets. 
+## This test checks we correctly re-renerate .debug_str_offsets. # CHECK: .debug_str_offsets contents # CHECK-NEXT: 0x00000000: Contribution size = 52, Format = DWARF32, Version = 5 diff --git a/bolt/test/X86/dwarf5-type-unit-no-cu-str-offset-table.test b/bolt/test/X86/dwarf5-type-unit-no-cu-str-offset-table.test index dc6255ff8c7bc..0cb62ed21abd1 100644 --- a/bolt/test/X86/dwarf5-type-unit-no-cu-str-offset-table.test +++ b/bolt/test/X86/dwarf5-type-unit-no-cu-str-offset-table.test @@ -7,7 +7,7 @@ # RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets %t.exe | FileCheck -check-prefix=PRE-BOLT %s # RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets %t.bolt | FileCheck -check-prefix=POST-BOLT %s -## This test checks we correclty re-generate .debug_str_offsets when there are type units that have an offset not shared with CU. +## This test checks we correctly re-generate .debug_str_offsets when there are type units that have an offset not shared with CU. # PRE-BOLT: .debug_str_offsets contents # PRE-BOLT-NEXT: Contribution size = 24, Format = DWARF32, Version = 5 diff --git a/bolt/test/X86/heatmap-preagg.test b/bolt/test/X86/heatmap-preagg.test index 493101664c4fd..4c46be4948951 100644 --- a/bolt/test/X86/heatmap-preagg.test +++ b/bolt/test/X86/heatmap-preagg.test @@ -32,7 +32,7 @@ RUN: --block-size=1024 | FileCheck --check-prefix CHECK-HEATMAP-BAT-1K %s CHECK-HEATMAP-BAT-1K: HEATMAP: dumping heatmap with bucket size 1024 CHECK-HEATMAP-BAT-1K-NOT: HEATMAP: dumping heatmap with bucket size -CHECK-HEATMAP: PERF2BOLT: read 81 aggregated LBR entries +CHECK-HEATMAP: PERF2BOLT: read 81 aggregated brstack entries CHECK-HEATMAP: HEATMAP: invalid traces: 1 CHECK-HEATMAP: HEATMAP: dumping heatmap with bucket size 64 CHECK-HEATMAP: HEATMAP: dumping heatmap with bucket size 128 @@ -71,7 +71,7 @@ CHECK-HM-1024-NEXT: 0 CHECK-BAT-HM-64: (349, 1126] CHECK-BAT-HM-4K: (605, 2182] -CHECK-HEATMAP-BAT: PERF2BOLT: read 79 aggregated LBR entries +CHECK-HEATMAP-BAT: PERF2BOLT: read 79 aggregated brstack entries CHECK-HEATMAP-BAT: HEATMAP: invalid traces: 2 CHECK-HEATMAP-BAT: HEATMAP: dumping heatmap with bucket size 64 CHECK-HEATMAP-BAT: HEATMAP: dumping heatmap with bucket size 4096 diff --git a/bolt/test/X86/jt-symbol-disambiguation-3.s b/bolt/test/X86/jt-symbol-disambiguation-3.s index 22b34cef1bc4d..c06fd3bb69894 100644 --- a/bolt/test/X86/jt-symbol-disambiguation-3.s +++ b/bolt/test/X86/jt-symbol-disambiguation-3.s @@ -1,6 +1,6 @@ ## In this test case, we reproduce the behavior seen in gcc where the ## base address of a jump table is decremented by some number and ends up -## at the exact addess of a jump table from another function. After +## at the exact address of a jump table from another function. After ## linking, the instruction references another jump table and that ## confuses BOLT. ## We repro here the following issue: @@ -28,7 +28,7 @@ # ---- # Func foo contains a jump table whose start is colocated with a # jump table reference in another function. However, the other function -# does not use the first entries of it and is merely doing arithmetics +# does not use the first entries of it and is merely doing arithmetic # to save the creation of unused first entries. 
# ---- .globl foo diff --git a/bolt/test/X86/nolbr.s b/bolt/test/X86/nolbr.s index 999c68566c949..d6710deeec343 100644 --- a/bolt/test/X86/nolbr.s +++ b/bolt/test/X86/nolbr.s @@ -17,7 +17,7 @@ # CHECK-FDATA-NEXT: 1 _start [[#]] 1 # CHECK-BOLT: BOLT-INFO: pre-processing profile using branch profile reader -# CHECK-BOLT: BOLT-INFO: operating with basic samples profiling data (no LBR). +# CHECK-BOLT: BOLT-INFO: operating with basic samples profiling data (no brstack). # CHECK-BOLT: BOLT-INFO: 1 out of 1 functions in the binary (100.0%) have non-empty execution profile .globl _start diff --git a/bolt/test/X86/split-landing-pad.s b/bolt/test/X86/split-landing-pad.s index 681f14f1e533e..149193dbe5188 100644 --- a/bolt/test/X86/split-landing-pad.s +++ b/bolt/test/X86/split-landing-pad.s @@ -1,5 +1,5 @@ ## This test reproduces the case where C++ exception handling is used and split -## function optimization is enabled. In particular, function foo is splitted +## function optimization is enabled. In particular, function foo is split ## to two fragments: ## foo: contains 2 try blocks, which invokes bar to throw exception ## foo.cold.1: contains 2 corresponding catch blocks (landing pad) diff --git a/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test b/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test index 1f44f7510a9fb..741fd5ef9b4b4 100644 --- a/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test +++ b/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test @@ -6,6 +6,6 @@ RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.e RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe 2> /dev/null -RUN: perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe | FileCheck %s --check-prefix=CHECK-SPE-LBR +RUN: perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe | FileCheck %s --check-prefix=CHECK-SPE-BRSTACK -CHECK-SPE-LBR: PERF2BOLT: parse SPE branch events in LBR-format +CHECK-SPE-BRSTACK: PERF2BOLT: parse SPE branch events in brstack-format diff --git a/bolt/test/runtime/X86/asm-dump.c b/bolt/test/runtime/X86/asm-dump.c index 7656fda44d8d4..fa0de9b72fb64 100644 --- a/bolt/test/runtime/X86/asm-dump.c +++ b/bolt/test/runtime/X86/asm-dump.c @@ -30,7 +30,7 @@ * Reconstruct fdata * RUN: link_fdata %t/main.s %t.o %t.fdata.reconst * - * XXX: reenable once dumping data is supported + * XXX: re-enable once dumping data is supported * Check if reoptimized file produces the same results * dontrun: %t.exe.reopt > %t.result.reopt * dontrun: cmp %t.result %t.result.reopt diff --git a/bolt/test/runtime/wait_file.sh b/bolt/test/runtime/wait_file.sh index 42d4c5b29e795..73464764249d5 100644 --- a/bolt/test/runtime/wait_file.sh +++ b/bolt/test/runtime/wait_file.sh @@ -12,7 +12,7 @@ check_file() { fuser -s "$file" local ret=$? 
- if [ $ret -eq 1 ]; then # noone has file open + if [ $ret -eq 1 ]; then # no one has file open return 0 fi if [ $ret -eq 0 ]; then # file open by some processes diff --git a/bolt/tools/heatmap/heatmap.cpp b/bolt/tools/heatmap/heatmap.cpp index 43167558b6758..17a969e0c8598 100644 --- a/bolt/tools/heatmap/heatmap.cpp +++ b/bolt/tools/heatmap/heatmap.cpp @@ -69,7 +69,8 @@ int main(int argc, char **argv) { " - Sampled profile collected from the binary:\n" " - perf data or pre-aggregated profile data (instrumentation profile " "not supported)\n" - " - perf data can have basic (IP) or branch-stack (LBR) samples\n\n" + " - perf data can have basic (IP) or branch-stack (brstack) " + "samples\n\n" " Outputs:\n" " - Heatmaps: colored ASCII (requires a color-capable terminal or a" diff --git a/bolt/tools/merge-fdata/merge-fdata.cpp b/bolt/tools/merge-fdata/merge-fdata.cpp index cfcb9373548a1..f5b0251ea0331 100644 --- a/bolt/tools/merge-fdata/merge-fdata.cpp +++ b/bolt/tools/merge-fdata/merge-fdata.cpp @@ -120,14 +120,14 @@ void mergeProfileHeaders(BinaryProfileHeader &MergedHeader, if (!MergedHeader.Id.empty() && (MergedHeader.Id != Header.Id)) errs() << "WARNING: build-ids in merged profiles do not match\n"; - // Cannot merge samples profile with LBR profile. + // Cannot merge samples profile with brstack profile. if (!MergedHeader.Flags) MergedHeader.Flags = Header.Flags; constexpr auto Mask = llvm::bolt::BinaryFunction::PF_BRANCH | llvm::bolt::BinaryFunction::PF_BASIC; if ((MergedHeader.Flags & Mask) != (Header.Flags & Mask)) { - errs() << "ERROR: cannot merge LBR profile with non-LBR profile\n"; + errs() << "ERROR: cannot merge brstack profile with non-brstack profile\n"; exit(1); } MergedHeader.Flags = MergedHeader.Flags | Header.Flags; diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp index af4cc9da9c9f4..bc37cedb435ae 100644 --- a/bolt/unittests/Core/MCPlusBuilder.cpp +++ b/bolt/unittests/Core/MCPlusBuilder.cpp @@ -261,6 +261,82 @@ TEST_P(MCPlusBuilderTester, testAccessedRegsMultipleDefs) { {AArch64::W5, AArch64::X5, AArch64::W5_HI}); } +TEST_P(MCPlusBuilderTester, AArch64_Psign_Pauth_variants) { + if (GetParam() != Triple::aarch64) + GTEST_SKIP(); + + MCInst Paciasp = MCInstBuilder(AArch64::PACIASP); + MCInst Pacibsp = MCInstBuilder(AArch64::PACIBSP); + ASSERT_TRUE(BC->MIB->isPSignOnLR(Paciasp)); + ASSERT_TRUE(BC->MIB->isPSignOnLR(Pacibsp)); + + MCInst PaciaSPLR = + MCInstBuilder(AArch64::PACIA).addReg(AArch64::LR).addReg(AArch64::SP); + MCInst PacibSPLR = + MCInstBuilder(AArch64::PACIB).addReg(AArch64::LR).addReg(AArch64::SP); + ASSERT_TRUE(BC->MIB->isPSignOnLR(PaciaSPLR)); + ASSERT_TRUE(BC->MIB->isPSignOnLR(PacibSPLR)); + + MCInst PacizaX5 = MCInstBuilder(AArch64::PACIZA).addReg(AArch64::X5); + MCInst PacizbX5 = MCInstBuilder(AArch64::PACIZB).addReg(AArch64::X5); + ASSERT_FALSE(BC->MIB->isPSignOnLR(PacizaX5)); + ASSERT_FALSE(BC->MIB->isPSignOnLR(PacizbX5)); + + MCInst Paciaz = MCInstBuilder(AArch64::PACIZA).addReg(AArch64::LR); + MCInst Pacibz = MCInstBuilder(AArch64::PACIZB).addReg(AArch64::LR); + ASSERT_TRUE(BC->MIB->isPSignOnLR(Paciaz)); + ASSERT_TRUE(BC->MIB->isPSignOnLR(Pacibz)); + + MCInst Pacia1716 = MCInstBuilder(AArch64::PACIA1716); + MCInst Pacib1716 = MCInstBuilder(AArch64::PACIB1716); + ASSERT_FALSE(BC->MIB->isPSignOnLR(Pacia1716)); + ASSERT_FALSE(BC->MIB->isPSignOnLR(Pacib1716)); + + MCInst Pacia171615 = MCInstBuilder(AArch64::PACIA171615); + MCInst Pacib171615 = MCInstBuilder(AArch64::PACIB171615); + 
ASSERT_FALSE(BC->MIB->isPSignOnLR(Pacia171615)); + ASSERT_FALSE(BC->MIB->isPSignOnLR(Pacib171615)); + + MCInst Autiasp = MCInstBuilder(AArch64::AUTIASP); + MCInst Autibsp = MCInstBuilder(AArch64::AUTIBSP); + ASSERT_TRUE(BC->MIB->isPAuthOnLR(Autiasp)); + ASSERT_TRUE(BC->MIB->isPAuthOnLR(Autibsp)); + + MCInst AutiaSPLR = + MCInstBuilder(AArch64::AUTIA).addReg(AArch64::LR).addReg(AArch64::SP); + MCInst AutibSPLR = + MCInstBuilder(AArch64::AUTIB).addReg(AArch64::LR).addReg(AArch64::SP); + ASSERT_TRUE(BC->MIB->isPAuthOnLR(AutiaSPLR)); + ASSERT_TRUE(BC->MIB->isPAuthOnLR(AutibSPLR)); + + MCInst AutizaX5 = MCInstBuilder(AArch64::AUTIZA).addReg(AArch64::X5); + MCInst AutizbX5 = MCInstBuilder(AArch64::AUTIZB).addReg(AArch64::X5); + ASSERT_FALSE(BC->MIB->isPAuthOnLR(AutizaX5)); + ASSERT_FALSE(BC->MIB->isPAuthOnLR(AutizbX5)); + + MCInst Autiaz = MCInstBuilder(AArch64::AUTIZA).addReg(AArch64::LR); + MCInst Autibz = MCInstBuilder(AArch64::AUTIZB).addReg(AArch64::LR); + ASSERT_TRUE(BC->MIB->isPAuthOnLR(Autiaz)); + ASSERT_TRUE(BC->MIB->isPAuthOnLR(Autibz)); + + MCInst Autia1716 = MCInstBuilder(AArch64::AUTIA1716); + MCInst Autib1716 = MCInstBuilder(AArch64::AUTIB1716); + ASSERT_FALSE(BC->MIB->isPAuthOnLR(Autia1716)); + ASSERT_FALSE(BC->MIB->isPAuthOnLR(Autib1716)); + + MCInst Autia171615 = MCInstBuilder(AArch64::AUTIA171615); + MCInst Autib171615 = MCInstBuilder(AArch64::AUTIB171615); + ASSERT_FALSE(BC->MIB->isPAuthOnLR(Autia171615)); + ASSERT_FALSE(BC->MIB->isPAuthOnLR(Autib171615)); + + MCInst Retaa = MCInstBuilder(AArch64::RETAA); + MCInst Retab = MCInstBuilder(AArch64::RETAB); + ASSERT_FALSE(BC->MIB->isPAuthOnLR(Retaa)); + ASSERT_FALSE(BC->MIB->isPAuthOnLR(Retab)); + ASSERT_TRUE(BC->MIB->isPAuthAndRet(Retaa)); + ASSERT_TRUE(BC->MIB->isPAuthAndRet(Retab)); +} + #endif // AARCH64_AVAILABLE #ifdef X86_AVAILABLE diff --git a/bolt/utils/bughunter.sh b/bolt/utils/bughunter.sh index c5dddc41fb41f..d5ce0592708e2 100755 --- a/bolt/utils/bughunter.sh +++ b/bolt/utils/bughunter.sh @@ -28,7 +28,7 @@ # # TIMEOUT_OR_CMD - optional timeout or command on optimized binary command # if the value is a number with an optional trailing letter -# [smhd] it is considered a paramter to "timeout", +# [smhd] it is considered a parameter to "timeout", # otherwise it's a shell command that wraps the optimized # binary command. # diff --git a/clang-tools-extra/clang-doc/Generators.cpp b/clang-tools-extra/clang-doc/Generators.cpp index 3fb5b63c403a7..a5f6f1c7ea732 100644 --- a/clang-tools-extra/clang-doc/Generators.cpp +++ b/clang-tools-extra/clang-doc/Generators.cpp @@ -97,15 +97,11 @@ void Generator::addInfoToIndex(Index &Idx, const doc::Info *Info) { // This anchor is used to force the linker to link in the generated object file // and thus register the generators. 
-static int LLVM_ATTRIBUTE_UNUSED YAMLGeneratorAnchorDest = - YAMLGeneratorAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED MDGeneratorAnchorDest = - MDGeneratorAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED HTMLGeneratorAnchorDest = - HTMLGeneratorAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED MHTMLGeneratorAnchorDest = +[[maybe_unused]] static int YAMLGeneratorAnchorDest = YAMLGeneratorAnchorSource; +[[maybe_unused]] static int MDGeneratorAnchorDest = MDGeneratorAnchorSource; +[[maybe_unused]] static int HTMLGeneratorAnchorDest = HTMLGeneratorAnchorSource; +[[maybe_unused]] static int MHTMLGeneratorAnchorDest = MHTMLGeneratorAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED JSONGeneratorAnchorDest = - JSONGeneratorAnchorSource; +[[maybe_unused]] static int JSONGeneratorAnchorDest = JSONGeneratorAnchorSource; } // namespace doc } // namespace clang diff --git a/clang-tools-extra/clang-tidy/.clang-format b/clang-tools-extra/clang-tidy/.clang-format index d18cf7c108ca6..5b5066116bbaa 100644 --- a/clang-tools-extra/clang-tidy/.clang-format +++ b/clang-tools-extra/clang-tidy/.clang-format @@ -1,2 +1,3 @@ BasedOnStyle: LLVM QualifierAlignment: Left +LineEnding: LF diff --git a/clang-tools-extra/clang-tidy/ClangTidyForceLinker.h b/clang-tools-extra/clang-tidy/ClangTidyForceLinker.h index cdf6ce2045a5d..afc358ad4e966 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyForceLinker.h +++ b/clang-tools-extra/clang-tidy/ClangTidyForceLinker.h @@ -16,132 +16,131 @@ namespace clang::tidy { // This anchor is used to force the linker to link the AbseilModule. extern volatile int AbseilModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED AbseilModuleAnchorDestination = +[[maybe_unused]] static int AbseilModuleAnchorDestination = AbseilModuleAnchorSource; // This anchor is used to force the linker to link the AlteraModule. extern volatile int AlteraModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED AlteraModuleAnchorDestination = +[[maybe_unused]] static int AlteraModuleAnchorDestination = AlteraModuleAnchorSource; // This anchor is used to force the linker to link the AndroidModule. extern volatile int AndroidModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED AndroidModuleAnchorDestination = +[[maybe_unused]] static int AndroidModuleAnchorDestination = AndroidModuleAnchorSource; // This anchor is used to force the linker to link the BoostModule. extern volatile int BoostModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED BoostModuleAnchorDestination = +[[maybe_unused]] static int BoostModuleAnchorDestination = BoostModuleAnchorSource; // This anchor is used to force the linker to link the BugproneModule. extern volatile int BugproneModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED BugproneModuleAnchorDestination = +[[maybe_unused]] static int BugproneModuleAnchorDestination = BugproneModuleAnchorSource; // This anchor is used to force the linker to link the CERTModule. extern volatile int CERTModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED CERTModuleAnchorDestination = +[[maybe_unused]] static int CERTModuleAnchorDestination = CERTModuleAnchorSource; // This anchor is used to force the linker to link the ConcurrencyModule. extern volatile int ConcurrencyModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED ConcurrencyModuleAnchorDestination = +[[maybe_unused]] static int ConcurrencyModuleAnchorDestination = ConcurrencyModuleAnchorSource; // This anchor is used to force the linker to link the CppCoreGuidelinesModule. 
extern volatile int CppCoreGuidelinesModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED CppCoreGuidelinesModuleAnchorDestination = +[[maybe_unused]] static int CppCoreGuidelinesModuleAnchorDestination = CppCoreGuidelinesModuleAnchorSource; #if CLANG_TIDY_ENABLE_QUERY_BASED_CUSTOM_CHECKS // This anchor is used to force the linker to link the CustomModule. extern volatile int CustomModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED CustomModuleAnchorDestination = +[[maybe_unused]] static int CustomModuleAnchorDestination = CustomModuleAnchorSource; #endif // This anchor is used to force the linker to link the DarwinModule. extern volatile int DarwinModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED DarwinModuleAnchorDestination = +[[maybe_unused]] static int DarwinModuleAnchorDestination = DarwinModuleAnchorSource; // This anchor is used to force the linker to link the FuchsiaModule. extern volatile int FuchsiaModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED FuchsiaModuleAnchorDestination = +[[maybe_unused]] static int FuchsiaModuleAnchorDestination = FuchsiaModuleAnchorSource; // This anchor is used to force the linker to link the GoogleModule. extern volatile int GoogleModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED GoogleModuleAnchorDestination = +[[maybe_unused]] static int GoogleModuleAnchorDestination = GoogleModuleAnchorSource; // This anchor is used to force the linker to link the HICPPModule. extern volatile int HICPPModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED HICPPModuleAnchorDestination = +[[maybe_unused]] static int HICPPModuleAnchorDestination = HICPPModuleAnchorSource; // This anchor is used to force the linker to link the LinuxKernelModule. extern volatile int LinuxKernelModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED LinuxKernelModuleAnchorDestination = +[[maybe_unused]] static int LinuxKernelModuleAnchorDestination = LinuxKernelModuleAnchorSource; // This anchor is used to force the linker to link the LLVMModule. extern volatile int LLVMModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED LLVMModuleAnchorDestination = +[[maybe_unused]] static int LLVMModuleAnchorDestination = LLVMModuleAnchorSource; // This anchor is used to force the linker to link the LLVMLibcModule. extern volatile int LLVMLibcModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED LLVMLibcModuleAnchorDestination = +[[maybe_unused]] static int LLVMLibcModuleAnchorDestination = LLVMLibcModuleAnchorSource; // This anchor is used to force the linker to link the MiscModule. extern volatile int MiscModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED MiscModuleAnchorDestination = +[[maybe_unused]] static int MiscModuleAnchorDestination = MiscModuleAnchorSource; // This anchor is used to force the linker to link the ModernizeModule. extern volatile int ModernizeModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED ModernizeModuleAnchorDestination = +[[maybe_unused]] static int ModernizeModuleAnchorDestination = ModernizeModuleAnchorSource; #if CLANG_TIDY_ENABLE_STATIC_ANALYZER && \ !defined(CLANG_TIDY_DISABLE_STATIC_ANALYZER_CHECKS) // This anchor is used to force the linker to link the MPIModule. extern volatile int MPIModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED MPIModuleAnchorDestination = - MPIModuleAnchorSource; +[[maybe_unused]] static int MPIModuleAnchorDestination = MPIModuleAnchorSource; #endif // This anchor is used to force the linker to link the ObjCModule. 
extern volatile int ObjCModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED ObjCModuleAnchorDestination = +[[maybe_unused]] static int ObjCModuleAnchorDestination = ObjCModuleAnchorSource; // This anchor is used to force the linker to link the OpenMPModule. extern volatile int OpenMPModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED OpenMPModuleAnchorDestination = +[[maybe_unused]] static int OpenMPModuleAnchorDestination = OpenMPModuleAnchorSource; // This anchor is used to force the linker to link the PerformanceModule. extern volatile int PerformanceModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED PerformanceModuleAnchorDestination = +[[maybe_unused]] static int PerformanceModuleAnchorDestination = PerformanceModuleAnchorSource; // This anchor is used to force the linker to link the PortabilityModule. extern volatile int PortabilityModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED PortabilityModuleAnchorDestination = +[[maybe_unused]] static int PortabilityModuleAnchorDestination = PortabilityModuleAnchorSource; // This anchor is used to force the linker to link the ReadabilityModule. extern volatile int ReadabilityModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED ReadabilityModuleAnchorDestination = +[[maybe_unused]] static int ReadabilityModuleAnchorDestination = ReadabilityModuleAnchorSource; // This anchor is used to force the linker to link the ZirconModule. extern volatile int ZirconModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED ZirconModuleAnchorDestination = +[[maybe_unused]] static int ZirconModuleAnchorDestination = ZirconModuleAnchorSource; } // namespace clang::tidy diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp index b752a9beb0e34..21455db7c7e7b 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp @@ -154,6 +154,7 @@ template <> struct ScalarEnumerationTraits { } }; template <> struct SequenceElementTraits { + // NOLINTNEXTLINE(readability-identifier-naming) Defined by YAMLTraits.h static const bool flow = false; }; template <> struct MappingTraits { @@ -165,6 +166,7 @@ template <> struct MappingTraits { } }; template <> struct SequenceElementTraits { + // NOLINTNEXTLINE(readability-identifier-naming) Defined by YAMLTraits.h static const bool flow = false; }; template <> struct MappingTraits { diff --git a/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp b/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp index 13d566087688f..d9f6551739d9e 100644 --- a/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.cpp @@ -20,8 +20,10 @@ namespace { AST_MATCHER(StringLiteral, lengthIsOne) { return Node.getLength() == 1; } -std::optional makeCharacterLiteral(const StringLiteral *Literal, - const ASTContext &Context) { +} // anonymous namespace + +static std::optional +makeCharacterLiteral(const StringLiteral *Literal, const ASTContext &Context) { assert(Literal->getLength() == 1 && "Only single character string should be matched"); assert(Literal->getCharByteWidth() == 1 && @@ -53,8 +55,6 @@ std::optional makeCharacterLiteral(const StringLiteral *Literal, return Result; } -} // anonymous namespace - void FasterStrsplitDelimiterCheck::registerMatchers(MatchFinder *Finder) { // Binds to one character string literals. 
const auto SingleChar = diff --git a/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp b/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp index d7cc0cacab6ea..a58c0410c4e35 100644 --- a/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp +++ b/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.cpp @@ -45,7 +45,10 @@ struct StrCatCheckResult { std::vector Hints; }; -void removeCallLeaveArgs(const CallExpr *Call, StrCatCheckResult *CheckResult) { +} // namespace + +static void removeCallLeaveArgs(const CallExpr *Call, + StrCatCheckResult *CheckResult) { if (Call->getNumArgs() == 0) return; // Remove 'Foo(' @@ -58,9 +61,9 @@ void removeCallLeaveArgs(const CallExpr *Call, StrCatCheckResult *CheckResult) { Call->getRParenLoc(), Call->getEndLoc().getLocWithOffset(1)))); } -const clang::CallExpr *processArgument(const Expr *Arg, - const MatchFinder::MatchResult &Result, - StrCatCheckResult *CheckResult) { +static const clang::CallExpr * +processArgument(const Expr *Arg, const MatchFinder::MatchResult &Result, + StrCatCheckResult *CheckResult) { const auto IsAlphanum = hasDeclaration(cxxMethodDecl(hasName("AlphaNum"))); static const auto *const Strcat = new auto(hasName("::absl::StrCat")); const auto IsStrcat = cxxBindTemporaryExpr( @@ -78,8 +81,8 @@ const clang::CallExpr *processArgument(const Expr *Arg, return nullptr; } -StrCatCheckResult processCall(const CallExpr *RootCall, bool IsAppend, - const MatchFinder::MatchResult &Result) { +static StrCatCheckResult processCall(const CallExpr *RootCall, bool IsAppend, + const MatchFinder::MatchResult &Result) { StrCatCheckResult CheckResult; std::deque CallsToProcess = {RootCall}; @@ -101,7 +104,6 @@ StrCatCheckResult processCall(const CallExpr *RootCall, bool IsAppend, } return CheckResult; } -} // namespace void RedundantStrcatCallsCheck::check(const MatchFinder::MatchResult &Result) { bool IsAppend = false; diff --git a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp index 6aad3c6b191ed..e90cdd00eb1fe 100644 --- a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp +++ b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.cpp @@ -215,13 +215,13 @@ bool UnrollLoopsCheck::hasLargeNumIterations(const Stmt *Statement, break; case (BO_MulAssign): Iterations = - 1 + (std::log((double)EndValue) - std::log((double)InitValue)) / - std::log((double)ConstantValue); + 1 + ((std::log((double)EndValue) - std::log((double)InitValue)) / + std::log((double)ConstantValue)); break; case (BO_DivAssign): Iterations = - 1 + (std::log((double)InitValue) - std::log((double)EndValue)) / - std::log((double)ConstantValue); + 1 + ((std::log((double)InitValue) - std::log((double)EndValue)) / + std::log((double)ConstantValue)); break; default: // All other operators are not handled; assume large bounds. diff --git a/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp b/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp index cd83423adae05..48c54c0ae02c3 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/CloexecCheck.cpp @@ -16,12 +16,13 @@ using namespace clang::ast_matchers; namespace clang::tidy::android { -namespace { // Helper function to form the correct string mode for Type3. // Build the replace text. If it's string constant, add directly in the // end of the string. Else, add . 
-std::string buildFixMsgForStringFlag(const Expr *Arg, const SourceManager &SM, - const LangOptions &LangOpts, char Mode) { +static std::string buildFixMsgForStringFlag(const Expr *Arg, + const SourceManager &SM, + const LangOptions &LangOpts, + char Mode) { if (Arg->getBeginLoc().isMacroID()) return (Lexer::getSourceText( CharSourceRange::getTokenRange(Arg->getSourceRange()), SM, @@ -32,11 +33,6 @@ std::string buildFixMsgForStringFlag(const Expr *Arg, const SourceManager &SM, StringRef SR = cast(Arg->IgnoreParenCasts())->getString(); return ("\"" + SR + Twine(Mode) + "\"").str(); } -} // namespace - -const char *CloexecCheck::FuncDeclBindingStr = "funcDecl"; - -const char *CloexecCheck::FuncBindingStr = "func"; void CloexecCheck::registerMatchersImpl( MatchFinder *Finder, internal::Matcher Function) { diff --git a/clang-tools-extra/clang-tidy/android/CloexecCheck.h b/clang-tools-extra/clang-tidy/android/CloexecCheck.h index 79f7ab3354d8d..b2b59f5be1b9a 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecCheck.h @@ -89,10 +89,10 @@ class CloexecCheck : public ClangTidyCheck { int N) const; /// Binding name of the FuncDecl of a function call. - static const char *FuncDeclBindingStr; + static constexpr char FuncDeclBindingStr[] = "funcDecl"; /// Binding name of the function call expression. - static const char *FuncBindingStr; + static constexpr char FuncBindingStr[] = "func"; }; } // namespace clang::tidy::android diff --git a/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.cpp index 6af535f712d71..3d3fc785b71f4 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.cpp @@ -51,6 +51,8 @@ struct ChainedComparisonData { void extract(const CXXOperatorCallExpr *Op); }; +} // namespace + void ChainedComparisonData::add(const Expr *Operand) { if (!Name.empty()) Name += ' '; @@ -111,8 +113,6 @@ void ChainedComparisonData::extract(const Expr *Op) { } } -} // namespace - void ChainedComparisonCheck::registerMatchers(MatchFinder *Finder) { const auto OperatorMatcher = expr(anyOf( binaryOperator(isComparisonOperator(), diff --git a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.cpp index 5b741e8c35b9a..9f8e885c91fff 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.cpp @@ -17,9 +17,7 @@ using namespace clang::tidy::matchers; namespace clang::tidy::bugprone { -namespace { - -ast_matchers::internal::BindableMatcher +static ast_matchers::internal::BindableMatcher handleFrom(const ast_matchers::internal::Matcher &IsAHandle, const ast_matchers::internal::Matcher &Arg) { return expr( @@ -31,7 +29,7 @@ handleFrom(const ast_matchers::internal::Matcher &IsAHandle, on(Arg)))); } -ast_matchers::internal::Matcher handleFromTemporaryValue( +static ast_matchers::internal::Matcher handleFromTemporaryValue( const ast_matchers::internal::Matcher &IsAHandle) { const auto TemporaryExpr = anyOf( @@ -49,22 +47,22 @@ ast_matchers::internal::Matcher handleFromTemporaryValue( return handleFrom(IsAHandle, anyOf(TemporaryExpr, TemporaryTernary)); } -ast_matchers::internal::Matcher isASequence() { +static ast_matchers::internal::Matcher isASequence() { return hasAnyName("::std::deque", "::std::forward_list", "::std::list", 
"::std::vector"); } -ast_matchers::internal::Matcher isASet() { +static ast_matchers::internal::Matcher isASet() { return hasAnyName("::std::set", "::std::multiset", "::std::unordered_set", "::std::unordered_multiset"); } -ast_matchers::internal::Matcher isAMap() { +static ast_matchers::internal::Matcher isAMap() { return hasAnyName("::std::map", "::std::multimap", "::std::unordered_map", "::std::unordered_multimap"); } -ast_matchers::internal::BindableMatcher makeContainerMatcher( +static ast_matchers::internal::BindableMatcher makeContainerMatcher( const ast_matchers::internal::Matcher &IsAHandle) { // This matcher could be expanded to detect: // - Constructors: eg. vector(3, string("A")); @@ -91,8 +89,6 @@ ast_matchers::internal::BindableMatcher makeContainerMatcher( hasOverloadedOperatorName("[]")))); } -} // anonymous namespace - DanglingHandleCheck::DanglingHandleCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), diff --git a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp index d8207b30f1b5e..b4ee35154f5f0 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp @@ -1074,7 +1074,7 @@ approximateStandardConversionSequence(const TheCheck &Check, QualType From, WorkType = To; } - if (Ctx.hasSameType(WorkType, To)) { + if (ASTContext::hasSameType(WorkType, To)) { LLVM_DEBUG(llvm::dbgs() << "<<< approximateStdConv. Reached 'To' type.\n"); return {Ctx.getCommonSugaredType(WorkType, To)}; } diff --git a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp index 4fc1b3b99ece4..76df992f29fc1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp @@ -69,14 +69,14 @@ class FindEnumMember : public TypeVisitor { return Visit(T->getElementType().getTypePtr()); } bool VisitEnumType(const EnumType *T) { - if (isCompleteAndHasNoZeroValue(T->getOriginalDecl())) { + if (isCompleteAndHasNoZeroValue(T->getDecl())) { FoundEnum = T; return true; } return false; } bool VisitRecordType(const RecordType *T) { - const RecordDecl *RD = T->getOriginalDecl()->getDefinition(); + const RecordDecl *RD = T->getDecl()->getDefinition(); if (!RD || RD->isUnion()) return false; auto VisitField = [this](const FieldDecl *F) { @@ -139,7 +139,7 @@ void InvalidEnumDefaultInitializationCheck::check( if (!Finder.Visit(InitList->getArrayFiller()->getType().getTypePtr())) return; InitExpr = InitList; - Enum = Finder.FoundEnum->getOriginalDecl(); + Enum = Finder.FoundEnum->getDecl(); } if (!InitExpr || !Enum) diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp index 390f3dd472a5b..54ed8994d0352 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.cpp @@ -18,8 +18,11 @@ namespace { AST_MATCHER(Expr, isInMacro) { return Node.getBeginLoc().isMacroID(); } +} // namespace + /// Find the next statement after `S`. 
-const Stmt *nextStmt(const MatchFinder::MatchResult &Result, const Stmt *S) {
+static const Stmt *nextStmt(const MatchFinder::MatchResult &Result,
+                            const Stmt *S) {
   auto Parents = Result.Context->getParents(*S);
   if (Parents.empty())
     return nullptr;
@@ -40,8 +43,8 @@ using ExpansionRanges = std::vector<SourceRange>;
 /// \brief Get all the macro expansion ranges related to `Loc`.
 ///
 /// The result is ordered from most inner to most outer.
-ExpansionRanges getExpansionRanges(SourceLocation Loc,
-                                   const MatchFinder::MatchResult &Result) {
+static ExpansionRanges
+getExpansionRanges(SourceLocation Loc, const MatchFinder::MatchResult &Result) {
   ExpansionRanges Locs;
   while (Loc.isMacroID()) {
     Locs.push_back(
@@ -51,8 +54,6 @@ ExpansionRanges getExpansionRanges(SourceLocation Loc,
   return Locs;
 }
 
-} // namespace
-
 void MultipleStatementMacroCheck::registerMatchers(MatchFinder *Finder) {
   const auto Inner = expr(isInMacro(), unless(compoundStmt())).bind("inner");
   Finder->addMatcher(
diff --git a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp
index d4676842a97ff..ca85168ffce0b 100644
--- a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp
@@ -64,15 +64,17 @@ static unsigned getLength(const Expr *E,
   if (!E)
     return 0;
 
-  Expr::EvalResult Length;
   E = E->IgnoreImpCasts();
 
   if (const auto *LengthDRE = dyn_cast<DeclRefExpr>(E))
     if (const auto *LengthVD = dyn_cast<VarDecl>(LengthDRE->getDecl()))
       if (!isa<ParmVarDecl>(LengthVD))
-        if (const Expr *LengthInit = LengthVD->getInit())
+        if (const Expr *LengthInit = LengthVD->getInit();
+            LengthInit && !LengthInit->isValueDependent()) {
+          Expr::EvalResult Length;
           if (LengthInit->EvaluateAsInt(Length, *Result.Context))
             return Length.Val.getInt().getZExtValue();
+        }
 
   if (const auto *LengthIL = dyn_cast<IntegerLiteral>(E))
     return LengthIL->getValue().getZExtValue();
@@ -307,10 +309,9 @@ static void lengthExprHandle(const Expr *LengthExpr,
   // Try to obtain an 'IntegerLiteral' and adjust it.
   if (!IsMacroDefinition) {
     if (const auto *LengthIL = dyn_cast<IntegerLiteral>(LengthExpr)) {
-      size_t NewLength = LengthIL->getValue().getZExtValue() +
-                         (LengthHandle == LengthHandleKind::Increase
-                              ? (isInjectUL(Result) ? 1UL : 1)
-                              : -1);
+      uint64_t NewLength =
+          LengthIL->getValue().getZExtValue() +
+          (LengthHandle == LengthHandleKind::Increase ? 1 : -1);
 
       const auto NewLengthFix = FixItHint::CreateReplacement(
           LengthIL->getSourceRange(),
diff --git a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp
index 86af5cbd94374..c262b1c05b047 100644
--- a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp
@@ -245,12 +245,10 @@ struct OptionEnumMapping<
 
 namespace bugprone {
 
-namespace {
-
 /// Returns if a function is declared inside a system header.
 /// These functions are considered to be "standard" (system-provided) library
 /// functions.
-bool isStandardFunction(const FunctionDecl *FD) {
+static bool isStandardFunction(const FunctionDecl *FD) {
   // Find a possible redeclaration in system header.
   // FIXME: Looking at the canonical declaration is not the most exact way
   // to do this.
@@ -284,7 +282,7 @@ bool isStandardFunction(const FunctionDecl *FD) {
 /// Check if a statement is "C++-only".
/// This includes all statements that have a class name with "CXX" prefix /// and every other statement that is declared in file ExprCXX.h. -bool isCXXOnlyStmt(const Stmt *S) { +static bool isCXXOnlyStmt(const Stmt *S) { StringRef Name = S->getStmtClassName(); if (Name.starts_with("CXX")) return true; @@ -304,7 +302,8 @@ bool isCXXOnlyStmt(const Stmt *S) { /// called from \p Caller, get a \c CallExpr of the corresponding function call. /// It is unspecified which call is found if multiple calls exist, but the order /// should be deterministic (depend only on the AST). -Expr *findCallExpr(const CallGraphNode *Caller, const CallGraphNode *Callee) { +static Expr *findCallExpr(const CallGraphNode *Caller, + const CallGraphNode *Callee) { const auto *FoundCallee = llvm::find_if( Caller->callees(), [Callee](const CallGraphNode::CallRecord &Call) { return Call.Callee == Callee; @@ -314,7 +313,7 @@ Expr *findCallExpr(const CallGraphNode *Caller, const CallGraphNode *Callee) { return FoundCallee->CallExpr; } -SourceRange getSourceRangeOfStmt(const Stmt *S, ASTContext &Ctx) { +static SourceRange getSourceRangeOfStmt(const Stmt *S, ASTContext &Ctx) { ParentMapContext &PM = Ctx.getParentMapContext(); DynTypedNode P = DynTypedNode::create(*S); while (P.getSourceRange().isInvalid()) { @@ -326,9 +325,9 @@ SourceRange getSourceRangeOfStmt(const Stmt *S, ASTContext &Ctx) { return P.getSourceRange(); } -AST_MATCHER(FunctionDecl, isStandardFunction) { - return isStandardFunction(&Node); -} +namespace { + AST_MATCHER(FunctionDecl, isStandard) { return isStandardFunction(&Node); } } // namespace @@ -354,7 +353,7 @@ bool SignalHandlerCheck::isLanguageVersionSupported( void SignalHandlerCheck::registerMatchers(MatchFinder *Finder) { auto SignalFunction = functionDecl(hasAnyName("::signal", "::std::signal"), - parameterCountIs(2), isStandardFunction()); + parameterCountIs(2), isStandard()); auto HandlerExpr = declRefExpr(hasDeclaration(functionDecl().bind("handler_decl")), unless(isExpandedFromMacro("SIG_IGN")), diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp index cdb6a088b9d0a..2672dc74f82f7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp @@ -50,15 +50,15 @@ AST_MATCHER_P2(Expr, hasSizeOfDescendant, int, Depth, AST_MATCHER(Expr, offsetOfExpr) { return isa<OffsetOfExpr>(Node); } -CharUnits getSizeOfType(const ASTContext &Ctx, const Type *Ty) { +} // namespace + +static CharUnits getSizeOfType(const ASTContext &Ctx, const Type *Ty) { if (!Ty || Ty->isIncompleteType() || Ty->isDependentType() || isa<DependentSizedArrayType>(Ty) || !Ty->isConstantSizeType()) return CharUnits::Zero(); return Ctx.getTypeSizeInChars(Ty); } -} // namespace - SizeofExpressionCheck::SizeofExpressionCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), @@ -424,7 +424,7 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { "suspicious usage of 'sizeof(array)/sizeof(...)';" " denominator differs from the size of array elements") << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange(); - } else if (NumTy && DenomTy && Ctx.hasSameType(NumTy, DenomTy) && + } else if (NumTy && DenomTy && ASTContext::hasSameType(NumTy, DenomTy) && !NumTy->isDependentType()) { // Dependent type should not be compared.
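The recurring `Ctx.hasSameType(...)` to `ASTContext::hasSameType(...)` rewrite works because the comparison only canonicalizes its two arguments, so no per-context state is involved; on the Clang revision this patch targets, the member is static. A rough sketch of the semantics being relied on (an assumed illustration, not code from this patch):

#include "clang/AST/ASTContext.h"

// Two QualTypes are "the same type" when their canonical, fully
// desugared forms compare equal; a typedef and its underlying type
// therefore compare equal here.
static bool isSameTypeSketch(clang::QualType A, clang::QualType B) {
  return clang::ASTContext::hasSameType(A, B);
}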
diag(E->getOperatorLoc(), @@ -433,7 +433,7 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange(); } else if (!WarnOnSizeOfPointer) { // When 'WarnOnSizeOfPointer' is enabled, these messages become redundant: - if (PointedTy && DenomTy && Ctx.hasSameType(PointedTy, DenomTy)) { + if (PointedTy && DenomTy && ASTContext::hasSameType(PointedTy, DenomTy)) { diag(E->getOperatorLoc(), "suspicious usage of 'sizeof(...)/sizeof(...)'; size of pointer " "is divided by size of pointed type") @@ -462,8 +462,8 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { const auto *SizeOfExpr = Result.Nodes.getNodeAs<Expr>("sizeof-ptr-mul-expr"); - if (Ctx.hasSameType(LPtrTy, RPtrTy) && - Ctx.hasSameType(LPtrTy, SizeofArgTy)) { + if (ASTContext::hasSameType(LPtrTy, RPtrTy) && + ASTContext::hasSameType(LPtrTy, SizeofArgTy)) { diag(SizeOfExpr->getBeginLoc(), "suspicious usage of 'sizeof(...)' in " "pointer arithmetic") << SizeOfExpr->getSourceRange() << E->getOperatorLoc() @@ -477,8 +477,8 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { const auto *SizeOfExpr = Result.Nodes.getNodeAs<Expr>("sizeof-ptr-div-expr"); - if (Ctx.hasSameType(LPtrTy, RPtrTy) && - Ctx.hasSameType(LPtrTy, SizeofArgTy)) { + if (ASTContext::hasSameType(LPtrTy, RPtrTy) && + ASTContext::hasSameType(LPtrTy, SizeofArgTy)) { diag(SizeOfExpr->getBeginLoc(), "suspicious usage of 'sizeof(...)' in " "pointer arithmetic") << SizeOfExpr->getSourceRange() << E->getOperatorLoc() diff --git a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp index e4f7a1778fd44..832377e376feb 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.cpp @@ -20,8 +20,9 @@ namespace { AST_MATCHER_P(IntegerLiteral, isBiggerThan, unsigned, N) { return Node.getValue().getZExtValue() > N; } +} // namespace -const char DefaultStringNames[] = +static const char DefaultStringNames[] = "::std::basic_string;::std::basic_string_view"; static std::vector<StringRef> @@ -36,8 +37,6 @@ removeNamespaces(const std::vector<StringRef> &Names) { return Result; } -} // namespace - StringConstructorCheck::StringConstructorCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), diff --git a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp index 0c8d2b8ef40f9..cef8b4da7fc17 100644 --- a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.cpp @@ -50,7 +50,7 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context, return false; // Check if return types are identical. - if (Context->hasSameType(DerivedReturnTy, BaseReturnTy)) + if (ASTContext::hasSameType(DerivedReturnTy, BaseReturnTy)) return true; /// Check if the return types are covariant. @@ -77,7 +77,7 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context, if (DRD == BRD) return true; - if (!Context->hasSameUnqualifiedType(DTy, BTy)) { + if (!ASTContext::hasSameUnqualifiedType(DTy, BTy)) { // Begin checking whether the conversion from D to B is valid.
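The covariant-return walk being edited here implements the usual C++ rule: an overriding function may return a pointer or reference to a class derived from the base method's return class, provided the derivation is unambiguous and accessible. For example:

struct Shape {
  virtual ~Shape() = default;
  virtual Shape *clone() const { return nullptr; }
};

struct Circle : Shape {
  // Covariant return type: Circle derives unambiguously and accessibly
  // from Shape, so this override is well-formed.
  Circle *clone() const override { return nullptr; }
};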
CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true, /*DetectVirtual=*/false); @@ -87,7 +87,8 @@ static bool checkOverridingFunctionReturnType(const ASTContext *Context, return false; // Check ambiguity. - if (Paths.isAmbiguous(Context->getCanonicalType(BTy).getUnqualifiedType())) + if (Paths.isAmbiguous( + ASTContext::getCanonicalType(BTy).getUnqualifiedType())) return false; // Check accessibility. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp index 5de4e33a1e16d..1ac9b8bbdfedb 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp @@ -28,10 +28,13 @@ AST_MATCHER(CXXRecordDecl, hasDefaultConstructor) { return Node.hasDefaultConstructor(); } +} // namespace + // Iterate over all the fields in a record type, both direct and indirect (e.g. // if the record contains an anonymous struct). template <typename T, typename Func> -void forEachField(const RecordDecl &Record, const T &Fields, const Func &Fn) { +static void forEachField(const RecordDecl &Record, const T &Fields, + const Func &Fn) { for (const FieldDecl *F : Fields) { if (F->isAnonymousStructOrUnion()) { if (const CXXRecordDecl *R = F->getType()->getAsCXXRecordDecl()) @@ -43,8 +46,9 @@ void forEachField(const RecordDecl &Record, const T &Fields, const Func &Fn) { } template <typename T, typename Func> -void forEachFieldWithFilter(const RecordDecl &Record, const T &Fields, - bool &AnyMemberHasInitPerUnion, const Func &Fn) { +static void forEachFieldWithFilter(const RecordDecl &Record, const T &Fields, + bool &AnyMemberHasInitPerUnion, + const Func &Fn) { for (const FieldDecl *F : Fields) { if (F->isAnonymousStructOrUnion()) { if (const CXXRecordDecl *R = F->getType()->getAsCXXRecordDecl()) { @@ -59,8 +63,9 @@ void forEachFieldWithFilter(const RecordDecl &Record, const T &Fields, } } -void removeFieldInitialized(const FieldDecl *M, - SmallPtrSetImpl<const FieldDecl *> &FieldDecls) { +static void +removeFieldInitialized(const FieldDecl *M, + SmallPtrSetImpl<const FieldDecl *> &FieldDecls) { const RecordDecl *R = M->getParent(); if (R && R->isUnion()) { // Erase all members in a union if any member of it is initialized. @@ -70,9 +75,9 @@ void removeFieldInitialized(const FieldDecl *M, FieldDecls.erase(M); } -void removeFieldsInitializedInBody( - const Stmt &Stmt, ASTContext &Context, - SmallPtrSetImpl<const FieldDecl *> &FieldDecls) { +static void +removeFieldsInitializedInBody(const Stmt &Stmt, ASTContext &Context, + SmallPtrSetImpl<const FieldDecl *> &FieldDecls) { auto Matches = match(findAll(binaryOperator( hasOperatorName("="), @@ -82,9 +87,9 @@ void removeFieldsInitializedInBody( removeFieldInitialized(Match.getNodeAs<FieldDecl>("fieldDecl"), FieldDecls); } -StringRef getName(const FieldDecl *Field) { return Field->getName(); } +static StringRef getName(const FieldDecl *Field) { return Field->getName(); } -StringRef getName(const RecordDecl *Record) { +static StringRef getName(const RecordDecl *Record) { // Get the typedef name if this is a C-style anonymous struct and typedef. if (const TypedefNameDecl *Typedef = Record->getTypedefNameForAnonDecl()) return Typedef->getName(); @@ -94,7 +99,7 @@ StringRef getName(const RecordDecl *Record) { // Creates comma separated list of decls requiring initialization in order of // declaration.
template <typename R, typename T> -std::string +static std::string toCommaSeparatedString(const R &OrderedDecls, const SmallPtrSetImpl<const T *> &DeclsToInit) { SmallVector<StringRef, 16> Names; @@ -105,12 +110,14 @@ toCommaSeparatedString(const R &OrderedDecls, return llvm::join(Names.begin(), Names.end(), ", "); } -SourceLocation getLocationForEndOfToken(const ASTContext &Context, - SourceLocation Location) { +static SourceLocation getLocationForEndOfToken(const ASTContext &Context, + SourceLocation Location) { return Lexer::getLocForEndOfToken(Location, 0, Context.getSourceManager(), Context.getLangOpts()); } +namespace { + // There are 3 kinds of insertion placements: enum class InitializerPlacement { // 1. The fields are inserted after an existing CXXCtorInitializer stored in @@ -187,15 +194,17 @@ struct InitializerInsertion { SmallVector Initializers; }; +} // namespace + // Convenience utility to get a RecordDecl from a QualType. -const RecordDecl *getCanonicalRecordDecl(const QualType &Type) { +static const RecordDecl *getCanonicalRecordDecl(const QualType &Type) { if (const auto *RT = Type->getAsCanonical<RecordType>()) - return RT->getOriginalDecl(); + return RT->getDecl(); return nullptr; } template -SmallVector +static SmallVector computeInsertions(const CXXConstructorDecl::init_const_range &Inits, const R &OrderedDecls, const SmallPtrSetImpl &DeclsToInit) { @@ -239,8 +248,9 @@ computeInsertions(const CXXConstructorDecl::init_const_range &Inits, // Gets the list of bases and members that could possibly be initialized, in // order as they appear in the class declaration. -void getInitializationsInOrder(const CXXRecordDecl &ClassDecl, - SmallVectorImpl<const NamedDecl *> &Decls) { +static void +getInitializationsInOrder(const CXXRecordDecl &ClassDecl, + SmallVectorImpl<const NamedDecl *> &Decls) { Decls.clear(); for (const auto &Base : ClassDecl.bases()) { // Decl may be null if the base class is a template parameter. @@ -253,9 +263,10 @@ void getInitializationsInOrder(const CXXRecordDecl &ClassDecl, } template <typename T> -void fixInitializerList(const ASTContext &Context, DiagnosticBuilder &Diag, - const CXXConstructorDecl *Ctor, - const SmallPtrSetImpl<const T *> &DeclsToInit) { +static void fixInitializerList(const ASTContext &Context, + DiagnosticBuilder &Diag, + const CXXConstructorDecl *Ctor, + const SmallPtrSetImpl<const T *> &DeclsToInit) { // Do not propose fixes in macros since we cannot place them correctly.
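The `isMacroID()` bail-out that opens the next hunk is a stock clang-tidy pattern: keep the diagnostic, but withhold the automatic rewrite when the location comes from a macro expansion, since the edit could land inside the macro definition and alter every expansion. A generic sketch with a hypothetical check and message, not this file's code:

void MySketchCheck::check(const MatchFinder::MatchResult &Result) {
  const auto *Ctor = Result.Nodes.getNodeAs<CXXConstructorDecl>("ctor");
  if (!Ctor)
    return;
  auto Diag = diag(Ctor->getLocation(), "hypothetical diagnostic text");
  // Attach the fix-it only when the constructor was not macro-generated.
  if (!Ctor->getBeginLoc().isMacroID())
    Diag << FixItHint::CreateInsertion(Ctor->getEndLoc(), " /* fix */");
}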
if (Ctor->getBeginLoc().isMacroID()) return; @@ -271,8 +282,6 @@ void fixInitializerList(const ASTContext &Context, DiagnosticBuilder &Diag, } } -} // anonymous namespace - ProTypeMemberInitCheck::ProTypeMemberInitCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), diff --git a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp b/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp index a038af4fa9543..6d5182d1e9787 100644 --- a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.cpp @@ -72,7 +72,7 @@ static bool isStdInitializerList(QualType Type) { } if (const auto *RT = Type->getAs<RecordType>()) { if (const auto *Specialization = - dyn_cast<ClassTemplateSpecializationDecl>(RT->getOriginalDecl())) + dyn_cast<ClassTemplateSpecializationDecl>(RT->getDecl())) return declIsStdInitializerList(Specialization->getSpecializedTemplate()); } return false; } diff --git a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp index c0c3ffaee796f..a4c76be92192e 100644 --- a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.cpp @@ -21,7 +21,9 @@ namespace { AST_MATCHER(VarDecl, isLocalVariable) { return Node.isLocalVarDecl(); } -FixItHint generateFixItHint(const VarDecl *Decl, bool IsConst) { +} // namespace + +static FixItHint generateFixItHint(const VarDecl *Decl, bool IsConst) { if (IsConst && (Decl->getStorageClass() != SC_Static)) { // No fix available if it is not a static constant, since it is difficult // to determine the proper fix in this case. @@ -52,7 +54,6 @@ FixItHint generateFixItHint(const VarDecl *Decl, bool IsConst) { CharSourceRange::getTokenRange(SourceRange(Decl->getLocation())), llvm::StringRef(NewName)); } -} // namespace void GlobalVariableDeclarationCheck::registerMatchers(MatchFinder *Finder) { // need to add two matchers since we need to bind different ids to distinguish diff --git a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp index 0d81b9a9e38ca..bd51cc5037dca 100644 --- a/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvm/UseNewMLIROpBuilderCheck.cpp @@ -111,10 +111,10 @@ EditGenerator rewrite(RangeSelector Call, RangeSelector Builder, } RewriteRuleWith<std::string> useNewMlirOpBuilderCheckRule() { - Stencil message = cat("use 'OpType::create(builder, ...)' instead of " + Stencil Message = cat("use 'OpType::create(builder, ...)' instead of " "'builder.create<OpType>(...)'"); // Match a create call on an OpBuilder. - ast_matchers::internal::Matcher<Stmt> base = + ast_matchers::internal::Matcher<Stmt> Base = cxxMemberCallExpr( on(expr(hasType( cxxRecordDecl(isSameOrDerivedFrom("::mlir::OpBuilder")))) @@ -124,10 +124,10 @@ RewriteRuleWith<std::string> useNewMlirOpBuilderCheckRule() { .bind("call"); return applyFirst( // Attempt rewrite given an lvalue builder, else just warn.
- {makeRule(cxxMemberCallExpr(unless(on(cxxTemporaryObjectExpr())), base), + {makeRule(cxxMemberCallExpr(unless(on(cxxTemporaryObjectExpr())), Base), rewrite(node("call"), node("builder"), callArgs("call")), - message), - makeRule(base, noopEdit(node("call")), message)}); + Message), + makeRule(Base, noopEdit(node("call")), Message)}); } } // namespace diff --git a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp index 9dae57a50bb52..091e4fe84b36a 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp +++ b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.cpp @@ -16,9 +16,7 @@ using namespace clang::ast_matchers; namespace clang::tidy::llvm_libc { -namespace { - -const TemplateParameterList * +static const TemplateParameterList * getLastTemplateParameterList(const FunctionDecl *FuncDecl) { const TemplateParameterList *ReturnList = FuncDecl->getDescribedTemplateParams(); @@ -35,8 +33,6 @@ getLastTemplateParameterList(const FunctionDecl *FuncDecl) { return ReturnList; } -} // namespace - InlineFunctionDeclCheck::InlineFunctionDeclCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp index 8ec7695aa842f..3b9b8e0daa62a 100644 --- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp @@ -60,12 +60,12 @@ AST_MATCHER_P(CoawaitExpr, awaitable, ast_matchers::internal::Matcher<Expr>, return InnerMatcher.matches(*E, Finder, Builder); return false; } +} // namespace -auto typeWithNameIn(const std::vector<StringRef> &Names) { +static auto typeWithNameIn(const std::vector<StringRef> &Names) { return hasType( hasCanonicalType(hasDeclaration(namedDecl(hasAnyName(Names))))); } -} // namespace CoroutineHostileRAIICheck::CoroutineHostileRAIICheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp index 5e0f32a900ea8..9801c9ea04d2d 100644 --- a/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/NewDeleteOverloadsCheck.cpp @@ -53,7 +53,7 @@ AST_MATCHER(FunctionDecl, isPlacementOverload) { const auto *FPT = Node.getType()->castAs<FunctionProtoType>(); ASTContext &Ctx = Node.getASTContext(); if (Ctx.getLangOpts().SizedDeallocation && - Ctx.hasSameType(FPT->getParamType(1), Ctx.getSizeType())) + ASTContext::hasSameType(FPT->getParamType(1), Ctx.getSizeType())) return false; return true; diff --git a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp index 0d7667ce53c0c..035598d354503 100644 --- a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.cpp @@ -151,10 +151,12 @@ constexpr unsigned SmallSCCSize = 32; using CallStackTy = llvm::SmallVector<const CallGraphNode *, 16>; +} // namespace + // In given SCC, find *some* call stack that will be cyclic. // This will only find *one* such stack, it might not be the smallest one, // and there may be other loops. -CallStackTy pathfindSomeCycle(ArrayRef<CallGraphNode *> SCC) { +static CallStackTy pathfindSomeCycle(ArrayRef<CallGraphNode *> SCC) { // We'll need to be able to performantly look up whether some CallGraphNode // is in SCC or not, so cache all the SCC elements in a set.
const ImmutableSmallSet<CallGraphNode *, SmallSCCSize> SCCElts(SCC); @@ -190,8 +192,6 @@ CallStackTy pathfindSomeCycle(ArrayRef<CallGraphNode *> SCC) { return CallStack; } -} // namespace - void NoRecursionCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher(translationUnitDecl().bind("TUDecl"), this); } diff --git a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp index 17a8a50ff04ac..6baa12a8bcedf 100644 --- a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp @@ -29,7 +29,6 @@ using namespace clang::ast_matchers; using namespace clang::tidy::matchers; namespace clang::tidy::misc { -namespace { using llvm::APSInt; static constexpr llvm::StringLiteral KnownBannedMacroNames[] = { @@ -420,6 +419,8 @@ markDuplicateOperands(const TExpr *TheExpr, return Duplicates.any(); } +namespace { + AST_MATCHER(Expr, isIntegerConstantExpr) { if (Node.isInstantiationDependent()) return false; @@ -470,6 +471,8 @@ AST_MATCHER_P(Expr, expandedByMacro, ArrayRef<llvm::StringLiteral>, Names) { return false; } +} // namespace + // Returns a matcher for integer constant expressions. static ast_matchers::internal::Matcher<Expr> matchIntegerConstantExpr(StringRef Id) { @@ -805,7 +808,8 @@ static bool isSameRawIdentifierToken(const Token &T1, const Token &T2, StringRef(SM.getCharacterData(T2.getLocation()), T2.getLength()); } -bool isTokAtEndOfExpr(SourceRange ExprSR, Token T, const SourceManager &SM) { +static bool isTokAtEndOfExpr(SourceRange ExprSR, Token T, + const SourceManager &SM) { return SM.getExpansionLoc(ExprSR.getEnd()) == T.getLocation(); } @@ -921,7 +925,6 @@ static bool areExprsSameMacroOrLiteral(const BinaryOperator *BinOp, return false; } -} // namespace void RedundantExpressionCheck::registerMatchers(MatchFinder *Finder) { const auto BannedIntegerLiteral = diff --git a/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp b/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp index 27ddb7cb9b71c..ab2077b304f13 100644 --- a/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UniqueptrResetReleaseCheck.cpp @@ -53,9 +53,8 @@ void UniqueptrResetReleaseCheck::registerMatchers(MatchFinder *Finder) { this); } -namespace { -const Type *getDeleterForUniquePtr(const MatchFinder::MatchResult &Result, - StringRef ID) { +static const Type * +getDeleterForUniquePtr(const MatchFinder::MatchResult &Result, StringRef ID) { const auto *Class = Result.Nodes.getNodeAs<ClassTemplateSpecializationDecl>(ID); if (!Class) return nullptr; @@ -66,7 +65,7 @@ const Type *getDeleterForUniquePtr(const MatchFinder::MatchResult &Result, return DeleterArgument.getAsType().getTypePtr(); } -bool areDeletersCompatible(const MatchFinder::MatchResult &Result) { +static bool areDeletersCompatible(const MatchFinder::MatchResult &Result) { const Type *LeftDeleterType = getDeleterForUniquePtr(Result, "left_class"); const Type *RightDeleterType = getDeleterForUniquePtr(Result, "right_class"); @@ -103,8 +102,6 @@ bool areDeletersCompatible(const MatchFinder::MatchResult &Result) { return false; } -} // namespace - void UniqueptrResetReleaseCheck::check(const MatchFinder::MatchResult &Result) { if (!areDeletersCompatible(Result)) return; diff --git a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp index 37e289cd9e497..f2189f546cf55 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp +++
b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp @@ -22,15 +22,14 @@ using namespace clang::ast_matchers; namespace clang::tidy::misc { -namespace { -bool isOverrideMethod(const FunctionDecl *Function) { +static bool isOverrideMethod(const FunctionDecl *Function) { if (const auto *MD = dyn_cast<CXXMethodDecl>(Function)) return MD->size_overridden_methods() > 0 || MD->hasAttr<OverrideAttr>(); return false; } -bool hasAttrAfterParam(const SourceManager *SourceManager, - const ParmVarDecl *Param) { +static bool hasAttrAfterParam(const SourceManager *SourceManager, + const ParmVarDecl *Param) { for (const auto *Attr : Param->attrs()) { if (SourceManager->isBeforeInTranslationUnit(Param->getLocation(), Attr->getLocation())) { @@ -39,7 +38,6 @@ bool hasAttrAfterParam(const SourceManager *SourceManager, } return false; } -} // namespace void UnusedParametersCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher(functionDecl(isDefinition(), hasBody(stmt()), diff --git a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp index 31524e41f12a3..81840cc984135 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.cpp @@ -132,7 +132,7 @@ void UnusedUsingDeclsCheck::check(const MatchFinder::MatchResult &Result) { } if (const auto *ECD = dyn_cast<EnumConstantDecl>(Used)) { if (const auto *ET = ECD->getType()->getAsCanonical<EnumType>()) - removeFromFoundDecls(ET->getOriginalDecl()); + removeFromFoundDecls(ET->getDecl()); } }; // We rely on the fact that the clang AST is walked in order, usages are only diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp index 37482583760f2..fea5ac6f29eff 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp @@ -499,7 +499,7 @@ static bool canBeModified(ASTContext *Context, const Expr *E) { return true; if (const auto *Cast = Parents[0].get<ImplicitCastExpr>()) { if ((Cast->getCastKind() == CK_NoOp && - Context->hasSameType(Cast->getType(), E->getType().withConst())) || + ASTContext::hasSameType(Cast->getType(), E->getType().withConst())) || (Cast->getCastKind() == CK_LValueToRValue && !Cast->getType().isNull() && Cast->getType()->isFundamentalType())) return false; @@ -664,7 +664,8 @@ void LoopConvertCheck::doConversion( AliasVarIsRef = true; } if (Descriptor.ElemType.isNull() || - !Context->hasSameUnqualifiedType(AliasVarType, Descriptor.ElemType)) + !ASTContext::hasSameUnqualifiedType(AliasVarType, + Descriptor.ElemType)) Descriptor.ElemType = AliasVarType; } @@ -944,7 +945,7 @@ bool LoopConvertCheck::isConvertible(ASTContext *Context, CanonicalInitVarType->isPointerType()) { // If the initializer and the variable are both pointers check if the // un-qualified pointee types match, otherwise we don't use auto.
- return Context->hasSameUnqualifiedType( + return ASTContext::hasSameUnqualifiedType( CanonicalBeginType->getPointeeType(), CanonicalInitVarType->getPointeeType()); } diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp index 286c39be44ce4..586deea46e86f 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertUtils.cpp @@ -370,7 +370,7 @@ static bool isAliasDecl(ASTContext *Context, const Decl *TheDecl, DeclarationType = DeclarationType.getNonReferenceType(); if (InitType.isNull() || DeclarationType.isNull() || - !Context->hasSameUnqualifiedType(DeclarationType, InitType)) + !ASTContext::hasSameUnqualifiedType(DeclarationType, InitType)) return false; } diff --git a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp index 5eebccc366fd5..38b30f7994ff3 100644 --- a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp @@ -14,10 +14,8 @@ using namespace clang::ast_matchers; namespace clang::tidy::modernize { -namespace { - // Determine if the given QualType is a nullary function or pointer to same. -bool protoTypeHasNoParms(QualType QT) { +static bool protoTypeHasNoParms(QualType QT) { if (const auto *PT = QT->getAs<PointerType>()) QT = PT->getPointeeType(); if (auto *MPT = QT->getAs<MemberPointerType>()) @@ -27,16 +25,14 @@ bool protoTypeHasNoParms(QualType QT) { return false; } -const char FunctionId[] = "function"; -const char TypedefId[] = "typedef"; -const char FieldId[] = "field"; -const char VarId[] = "var"; -const char NamedCastId[] = "named-cast"; -const char CStyleCastId[] = "c-style-cast"; -const char ExplicitCastId[] = "explicit-cast"; -const char LambdaId[] = "lambda"; - -} // namespace +static const char FunctionId[] = "function"; +static const char TypedefId[] = "typedef"; +static const char FieldId[] = "field"; +static const char VarId[] = "var"; +static const char NamedCastId[] = "named-cast"; +static const char CStyleCastId[] = "c-style-cast"; +static const char ExplicitCastId[] = "explicit-cast"; +static const char LambdaId[] = "lambda"; void RedundantVoidArgCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher(functionDecl(parameterCountIs(0), unless(isImplicit()), diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp index c7fd0a9695952..01796a6f4af2d 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp @@ -316,7 +316,7 @@ void UseAutoCheck::replaceIterators(const DeclStmt *D, ASTContext *Context) { if (NestedConstruct->getConstructor()->isConvertingConstructor(false)) return; } - if (!Context->hasSameType(V->getType(), E->getType())) + if (!ASTContext::hasSameType(V->getType(), E->getType())) return; } @@ -378,7 +378,7 @@ void UseAutoCheck::replaceExpr( return; // If VarDecl and Initializer have mismatching unqualified types.
- if (!Context->hasSameUnqualifiedType(V->getType(), GetType(Expr))) + if (!ASTContext::hasSameUnqualifiedType(V->getType(), GetType(Expr))) return; // All subsequent variables in this declaration should have the same diff --git a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp index 0003429c62890..35320e83c5d4b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp @@ -34,13 +34,12 @@ AST_MATCHER(clang::QualType, isActualChar) { } // namespace static BindableMatcher<clang::Stmt> -intCastExpression(bool IsSigned, - const std::string &CastBindName = std::string()) { +intCastExpression(bool IsSigned, StringRef CastBindName = {}) { // std::cmp_{} functions trigger a compile-time error if either LHS or RHS // is a non-integer type, char, enum or bool // (unsigned char/ signed char are Ok and can be used). auto IntTypeExpr = expr(hasType(hasCanonicalType(qualType( - isInteger(), IsSigned ? isSignedInteger() : isUnsignedInteger(), + IsSigned ? isSignedInteger() : isUnsignedInteger(), unless(isActualChar()), unless(booleanType()), unless(enumType()))))); const auto ImplicitCastExpr = @@ -71,7 +70,7 @@ static StringRef parseOpCode(BinaryOperator::Opcode Code) { case BO_NE: return "cmp_not_equal"; default: - return ""; + llvm_unreachable("invalid opcode"); } } @@ -119,23 +118,16 @@ void UseIntegerSignComparisonCheck::check( Expr::EvalResult EVResult; if (!SignedCastExpression->isValueDependent() && SignedCastExpression->getSubExpr()->EvaluateAsInt(EVResult, - *Result.Context)) { - const llvm::APSInt SValue = EVResult.Val.getInt(); - if (SValue.isNonNegative()) - return; - } + *Result.Context) && + EVResult.Val.getInt().isNonNegative()) + return; const auto *BinaryOp = Result.Nodes.getNodeAs<BinaryOperator>("intComparison"); - if (BinaryOp == nullptr) - return; - - const BinaryOperator::Opcode OpCode = BinaryOp->getOpcode(); + assert(BinaryOp); const Expr *LHS = BinaryOp->getLHS()->IgnoreImpCasts(); const Expr *RHS = BinaryOp->getRHS()->IgnoreImpCasts(); - if (LHS == nullptr || RHS == nullptr) - return; const Expr *SubExprLHS = nullptr; const Expr *SubExprRHS = nullptr; SourceRange R1(LHS->getBeginLoc()); @@ -144,33 +136,34 @@ void UseIntegerSignComparisonCheck::check( RHS->getEndLoc(), 0, *Result.SourceManager, getLangOpts())); if (const auto *LHSCast = llvm::dyn_cast<ImplicitCastExpr>(LHS)) { SubExprLHS = LHSCast->getSubExpr(); - R1 = SourceRange(LHS->getBeginLoc(), - SubExprLHS->getBeginLoc().getLocWithOffset(-1)); + R1.setEnd(SubExprLHS->getBeginLoc().getLocWithOffset(-1)); R2.setBegin(Lexer::getLocForEndOfToken( SubExprLHS->getEndLoc(), 0, *Result.SourceManager, getLangOpts())); } if (const auto *RHSCast = llvm::dyn_cast<ImplicitCastExpr>(RHS)) { SubExprRHS = RHSCast->getSubExpr(); R2.setEnd(SubExprRHS->getBeginLoc().getLocWithOffset(-1)); + R3.setBegin(Lexer::getLocForEndOfToken( + SubExprRHS->getEndLoc(), 0, *Result.SourceManager, getLangOpts())); } DiagnosticBuilder Diag = diag(BinaryOp->getBeginLoc(), "comparison between 'signed' and 'unsigned' integers"); - std::string CmpNamespace; - llvm::StringRef CmpHeader; + StringRef CmpNamespace; + StringRef CmpHeader; if (getLangOpts().CPlusPlus20) { CmpHeader = "<utility>"; - CmpNamespace = llvm::Twine("std::" + parseOpCode(OpCode)).str(); + CmpNamespace = "std::"; } else if (getLangOpts().CPlusPlus17 && EnableQtSupport) { CmpHeader = "<QtCore/q20utility.h>"; - CmpNamespace = llvm::Twine("q20::" + parseOpCode(OpCode)).str(); +
CmpNamespace = "q20::"; } // Prefer modernize-use-integer-sign-comparison when C++20 is available! Diag << FixItHint::CreateReplacement( CharSourceRange(R1, SubExprLHS != nullptr), - llvm::Twine(CmpNamespace + "(").str()); + Twine(CmpNamespace + parseOpCode(BinaryOp->getOpcode()) + "(").str()); Diag << FixItHint::CreateReplacement(R2, ","); Diag << FixItHint::CreateReplacement(CharSourceRange::getCharRange(R3), ")"); diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp index b921819ad13e6..b6834c69204c2 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp @@ -21,8 +21,6 @@ using namespace llvm; namespace clang::tidy::modernize { namespace { -const char CastSequence[] = "sequence"; - AST_MATCHER(Type, sugaredNullptrType) { const Type *DesugaredType = Node.getUnqualifiedDesugaredType(); if (const auto *BT = dyn_cast(DesugaredType)) @@ -30,6 +28,10 @@ AST_MATCHER(Type, sugaredNullptrType) { return false; } +} // namespace + +static const char CastSequence[] = "sequence"; + /// Create a matcher that finds implicit casts as well as the head of a /// sequence of zero or more nested explicit casts that have an implicit cast /// to null within. @@ -43,7 +45,8 @@ AST_MATCHER(Type, sugaredNullptrType) { /// would check for the "NULL" macro instead, but that'd be harder to express. /// In practice, "NULL" is often defined as "__null", and this is a useful /// condition. -StatementMatcher makeCastSequenceMatcher(llvm::ArrayRef NameList) { +static StatementMatcher +makeCastSequenceMatcher(llvm::ArrayRef NameList) { auto ImplicitCastToNull = implicitCastExpr( anyOf(hasCastKind(CK_NullToPointer), hasCastKind(CK_NullToMemberPointer)), anyOf(hasSourceExpression(gnuNullExpr()), @@ -79,16 +82,16 @@ StatementMatcher makeCastSequenceMatcher(llvm::ArrayRef NameList) { unless(hasAncestor(functionDecl(isDefaulted())))))); } -bool isReplaceableRange(SourceLocation StartLoc, SourceLocation EndLoc, - const SourceManager &SM) { +static bool isReplaceableRange(SourceLocation StartLoc, SourceLocation EndLoc, + const SourceManager &SM) { return SM.isWrittenInSameFile(StartLoc, EndLoc); } /// Replaces the provided range with the text "nullptr", but only if /// the start and end location are both in main file. /// Returns true if and only if a replacement was made. -void replaceWithNullptr(ClangTidyCheck &Check, SourceManager &SM, - SourceLocation StartLoc, SourceLocation EndLoc) { +static void replaceWithNullptr(ClangTidyCheck &Check, SourceManager &SM, + SourceLocation StartLoc, SourceLocation EndLoc) { CharSourceRange Range(SourceRange(StartLoc, EndLoc), true); // Add a space if nullptr follows an alphanumeric character. This happens // whenever there is an c-style explicit cast to nullptr not surrounded by @@ -106,8 +109,9 @@ void replaceWithNullptr(ClangTidyCheck &Check, SourceManager &SM, /// #define MY_NULL NULL /// \endcode /// If \p Loc points to NULL, this function will return the name MY_NULL. 
-StringRef getOutermostMacroName(SourceLocation Loc, const SourceManager &SM, - const LangOptions &LO) { +static StringRef getOutermostMacroName(SourceLocation Loc, + const SourceManager &SM, + const LangOptions &LO) { assert(Loc.isMacroID()); SourceLocation OutermostMacroLoc; @@ -119,6 +123,8 @@ StringRef getOutermostMacroName(SourceLocation Loc, const SourceManager &SM, return Lexer::getImmediateMacroName(OutermostMacroLoc, SM, LO); } +namespace { + /// RecursiveASTVisitor for ensuring all nodes rooted at a given AST /// subtree that have file-level source locations corresponding to a macro /// argument have implicit NullTo(Member)Pointer nodes as ancestors. diff --git a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp index aa1ee6db8917a..a004480cb1b92 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.cpp @@ -29,7 +29,7 @@ static bool isLockGuardDecl(const NamedDecl *Decl) { static bool isLockGuard(const QualType &Type) { if (const auto *Record = Type->getAsCanonical<RecordType>()) - if (const RecordDecl *Decl = Record->getOriginalDecl()) + if (const RecordDecl *Decl = Record->getDecl()) return isLockGuardDecl(Decl); if (const auto *TemplateSpecType = Type->getAs<TemplateSpecializationType>()) diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp index a04f78c271d42..414aa86c5fbd2 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.cpp @@ -255,8 +255,10 @@ struct MatchBuilder { double DiffThreshold; }; -std::string getCode(const StringRef Constant, const bool IsFloat, - const bool IsLongDouble) { +} // namespace + +static std::string getCode(const StringRef Constant, const bool IsFloat, + const bool IsLongDouble) { if (IsFloat) { return ("std::numbers::" + Constant + "_v").str(); } @@ -266,9 +268,9 @@ std::string getCode(const StringRef Constant, const bool IsFloat, return ("std::numbers::" + Constant).str(); } -bool isRangeOfCompleteMacro(const clang::SourceRange &Range, - const clang::SourceManager &SM, - const clang::LangOptions &LO) { +static bool isRangeOfCompleteMacro(const clang::SourceRange &Range, + const clang::SourceManager &SM, + const clang::LangOptions &LO) { if (!Range.getBegin().isMacroID()) { return false; } @@ -287,8 +289,6 @@ bool isRangeOfCompleteMacro(const clang::SourceRange &Range, return true; } -} // namespace - namespace clang::tidy::modernize { UseStdNumbersCheck::UseStdNumbersCheck(const StringRef Name, ClangTidyContext *const Context) diff --git a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp index 3e27d8fa1fe42..d623ec402179b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp @@ -77,7 +77,7 @@ struct UnqualNameVisitor : public RecursiveASTVisitor<UnqualNameVisitor> { if (T->getKeyword() != ElaboratedTypeKeyword::None || TTL.getQualifierLoc()) break; - if (visitUnqualName(T->getOriginalDecl()->getName())) + if (visitUnqualName(T->getDecl()->getName())) return false; break; } diff --git a/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.cpp b/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.cpp index edd3ded2e2858..03981c1aadff0 ---
a/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.cpp @@ -35,7 +35,8 @@ const std::uint64_t Min32 = std::imaxabs(std::numeric_limits<std::int32_t>::min()); const std::uint64_t Max32 = std::numeric_limits<std::uint32_t>::max(); -std::pair<const char *, std::uint32_t> +} // namespace +static std::pair<const char *, std::uint32_t> getNewType(std::size_t Size, std::uint64_t Min, std::uint64_t Max) noexcept { if (Min) { if (Min <= Min8 && Max <= Max8) { @@ -75,8 +76,6 @@ getNewType(std::size_t Size, std::uint64_t Min, std::uint64_t Max) noexcept { return {"std::uint8_t", sizeof(std::uint8_t)}; } -} // namespace - EnumSizeCheck::EnumSizeCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), EnumIgnoreList( diff --git a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp index d26480fc9f60d..7c90130c826f0 100644 --- a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.cpp @@ -17,9 +17,8 @@ using namespace clang::ast_matchers; namespace clang::tidy::performance { -namespace { - -std::optional<std::string> makeCharacterLiteral(const StringLiteral *Literal) { +static std::optional<std::string> +makeCharacterLiteral(const StringLiteral *Literal) { std::string Result; { llvm::raw_string_ostream OS(Result); @@ -43,6 +42,8 @@ std::optional<std::string> makeCharacterLiteral(const StringLiteral *Literal) { return Result; } +namespace { + AST_MATCHER_FUNCTION(ast_matchers::internal::Matcher<Expr>, hasSubstitutedType) { return hasType(qualType(anyOf(substTemplateTypeParmType(), diff --git a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp index 3da1469a9f120..4a8f292b726ee 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.cpp @@ -17,8 +17,6 @@ using namespace clang::ast_matchers; namespace clang::tidy::performance { -namespace { - // Matcher names.
Given the code: // // \code @@ -60,12 +58,14 @@ static const char LoopInitVarName[] = "loop_init_var"; static const char LoopEndExprName[] = "loop_end_expr"; static const char RangeLoopName[] = "for_range_loop"; -ast_matchers::internal::Matcher<Expr> supportedContainerTypesMatcher() { +static ast_matchers::internal::Matcher<Expr> supportedContainerTypesMatcher() { return hasType(cxxRecordDecl(hasAnyName( "::std::vector", "::std::set", "::std::unordered_set", "::std::map", "::std::unordered_map", "::std::array", "::std::deque"))); } +namespace { + AST_MATCHER(Expr, hasSideEffects) { return Node.HasSideEffects(Finder->getASTContext()); } diff --git a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp index 554996730c2be..02fe913ee7918 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp +++ b/clang-tools-extra/clang-tidy/readability/AvoidConstParamsInDecls.cpp @@ -15,9 +15,8 @@ using namespace clang::ast_matchers; namespace clang::tidy::readability { -namespace { -SourceRange getTypeRange(const ParmVarDecl &Param) { +static SourceRange getTypeRange(const ParmVarDecl &Param) { return {Param.getBeginLoc(), Param.getLocation().getLocWithOffset(-1)}; } @@ -39,8 +38,6 @@ findConstToRemove(const ParmVarDecl &Param, tok::kw_const, FileRange, *Result.Context, *Result.SourceManager); } -} // namespace - void AvoidConstParamsInDecls::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "IgnoreMacros", IgnoreMacros); } diff --git a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp index 570a109e55b14..0237c057afed5 100644 --- a/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/DuplicateIncludeCheck.cpp @@ -64,6 +64,8 @@ class DuplicateIncludeCallbacks : public PPCallbacks { const SourceManager &SM; }; +} // namespace + void DuplicateIncludeCallbacks::FileChanged(SourceLocation Loc, FileChangeReason Reason, SrcMgr::CharacteristicKind FileType, @@ -107,8 +109,6 @@ void DuplicateIncludeCallbacks::MacroUndefined(const Token &MacroNameTok, Files.back().clear(); } -} // namespace - void DuplicateIncludeCheck::registerPPCallbacks( const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { PP->addPPCallbacks(std::make_unique<DuplicateIncludeCallbacks>(*this, SM)); } diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp index bfdf9cbd92d2b..6f6da57d7822b 100644 --- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp @@ -51,7 +51,7 @@ static StringRef getZeroLiteralToCompareWithForType(CastKind CastExprKind, return Type->isUnsignedIntegerType() ? "0u" : "0"; case CK_FloatingToBoolean: - return Context.hasSameType(Type, Context.FloatTy) ? "0.0f" : "0.0"; + return ASTContext::hasSameType(Type, Context.FloatTy) ? "0.0f" : "0.0"; case CK_PointerToBoolean: case CK_MemberPointerToBoolean: // Fall-through on purpose. @@ -215,7 +215,7 @@ getEquivalentForBoolLiteral(const CXXBoolLiteralExpr *BoolLiteral, } if (DestType->isFloatingType()) { - if (Context.hasSameType(DestType, Context.FloatTy)) { + if (ASTContext::hasSameType(DestType, Context.FloatTy)) { return BoolLiteral->getValue() ? "1.0f" : "0.0f"; } return BoolLiteral->getValue() ?
"1.0" : "0.0"; diff --git a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp index 2eb26fcf840cd..93580a7e67c4a 100644 --- a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp @@ -54,9 +54,12 @@ struct InconsistentDeclarationInfo { using InconsistentDeclarationsContainer = llvm::SmallVector; -bool checkIfFixItHintIsApplicable( - const FunctionDecl *ParameterSourceDeclaration, - const ParmVarDecl *SourceParam, const FunctionDecl *OriginalDeclaration) { +} // namespace + +static bool +checkIfFixItHintIsApplicable(const FunctionDecl *ParameterSourceDeclaration, + const ParmVarDecl *SourceParam, + const FunctionDecl *OriginalDeclaration) { // Assumptions with regard to function declarations/definition: // * If both function declaration and definition are seen, assume that // definition is most up-to-date, and use it to generate replacements. @@ -83,7 +86,7 @@ bool checkIfFixItHintIsApplicable( return true; } -bool nameMatch(StringRef L, StringRef R, bool Strict) { +static bool nameMatch(StringRef L, StringRef R, bool Strict) { if (Strict) return L.empty() || R.empty() || L == R; // We allow two names if one is a prefix/suffix of the other, ignoring case. @@ -92,7 +95,7 @@ bool nameMatch(StringRef L, StringRef R, bool Strict) { L.ends_with_insensitive(R) || R.ends_with_insensitive(L); } -DifferingParamsContainer +static DifferingParamsContainer findDifferingParamsInDeclaration(const FunctionDecl *ParameterSourceDeclaration, const FunctionDecl *OtherDeclaration, const FunctionDecl *OriginalDeclaration, @@ -129,7 +132,7 @@ findDifferingParamsInDeclaration(const FunctionDecl *ParameterSourceDeclaration, return DifferingParams; } -InconsistentDeclarationsContainer +static InconsistentDeclarationsContainer findInconsistentDeclarations(const FunctionDecl *OriginalDeclaration, const FunctionDecl *ParameterSourceDeclaration, SourceManager &SM, bool Strict) { @@ -162,7 +165,7 @@ findInconsistentDeclarations(const FunctionDecl *OriginalDeclaration, return InconsistentDeclarations; } -const FunctionDecl * +static const FunctionDecl * getParameterSourceDeclaration(const FunctionDecl *OriginalDeclaration) { const FunctionTemplateDecl *PrimaryTemplate = OriginalDeclaration->getPrimaryTemplate(); @@ -187,7 +190,7 @@ getParameterSourceDeclaration(const FunctionDecl *OriginalDeclaration) { return OriginalDeclaration; } -std::string joinParameterNames( +static std::string joinParameterNames( const DifferingParamsContainer &DifferingParams, llvm::function_ref ChooseParamName) { llvm::SmallString<40> Str; @@ -202,7 +205,7 @@ std::string joinParameterNames( return std::string(Str); } -void formatDifferingParamsDiagnostic( +static void formatDifferingParamsDiagnostic( InconsistentDeclarationParameterNameCheck *Check, SourceLocation Location, StringRef OtherDeclarationDescription, const DifferingParamsContainer &DifferingParams) { @@ -230,7 +233,7 @@ void formatDifferingParamsDiagnostic( } } -void formatDiagnosticsForDeclarations( +static void formatDiagnosticsForDeclarations( InconsistentDeclarationParameterNameCheck *Check, const FunctionDecl *ParameterSourceDeclaration, const FunctionDecl *OriginalDeclaration, @@ -256,7 +259,7 @@ void formatDiagnosticsForDeclarations( } } -void formatDiagnostics( +static void formatDiagnostics( 
InconsistentDeclarationParameterNameCheck *Check, const FunctionDecl *ParameterSourceDeclaration, const FunctionDecl *OriginalDeclaration, @@ -279,8 +282,6 @@ void formatDiagnostics( } } -} // anonymous namespace - void InconsistentDeclarationParameterNameCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "IgnoreMacros", IgnoreMacros); diff --git a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp index dc9510d1dab62..942a0a8a4469f 100644 --- a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.cpp @@ -142,12 +142,11 @@ void QualifiedAutoCheck::registerMatchers(MatchFinder *Finder) { if (this->IgnoreAliasing) { return qualType( hasUnqualifiedDesugaredType(pointerType(pointee(InnerMatchers...)))); - } else { - return qualType( - anyOf(qualType(pointerType(pointee(InnerMatchers...))), - qualType(substTemplateTypeParmType(hasReplacementType( - pointerType(pointee(InnerMatchers...))))))); } + return qualType(anyOf(qualType(pointerType(pointee(InnerMatchers...))), + qualType(substTemplateTypeParmType(hasReplacementType( + pointerType(pointee(InnerMatchers...))))))); + }; auto IsAutoDeducedToPointer = diff --git a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp index 0598683bff6c2..11065230edc60 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp @@ -14,8 +14,8 @@ using namespace clang::ast_matchers; namespace clang::tidy::readability { -namespace { -internal::Matcher<Expr> callToGet(const internal::Matcher<Decl> &OnClass) { +static internal::Matcher<Expr> +callToGet(const internal::Matcher<Decl> &OnClass) { return expr( anyOf(cxxMemberCallExpr( on(expr(anyOf(hasType(OnClass), .bind("redundant_get"); } -internal::Matcher<Decl> knownSmartptr() { +static internal::Matcher<Decl> knownSmartptr() { return recordDecl(hasAnyName("::std::unique_ptr", "::std::shared_ptr")); } -void registerMatchersForGetArrowStart(MatchFinder *Finder, - MatchFinder::MatchCallback *Callback) { +static void +registerMatchersForGetArrowStart(MatchFinder *Finder, + MatchFinder::MatchCallback *Callback) { const auto MatchesOpArrow = allOf(hasName("operator->"), returns(qualType(pointsTo(type().bind("op->Type"))))); @@ -100,8 +101,8 @@ void registerMatchersForGetArrowStart(MatchFinder *Finder, Callback); } -void registerMatchersForGetEquals(MatchFinder *Finder, - MatchFinder::MatchCallback *Callback) { +static void registerMatchersForGetEquals(MatchFinder *Finder, + MatchFinder::MatchCallback *Callback) { // This one is harder to do with duck typing. // The operator==/!= that we are looking for might be member or non-member, // might be on global namespace or found by ADL, might be a template, etc. @@ -118,8 +119,6 @@ void registerMatchersForGetEquals(MatchFinder *Finder, // FIXME: Match and fix if (l.get() == r.get()).
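These matchers drive readability-redundant-smartptr-get; illustratively, the shapes involved look like this (a usage sketch, not the check's test suite):

#include <memory>

void sketch(const std::unique_ptr<int> &P) {
  if (P.get() == nullptr) { } // flagged; fix: if (P == nullptr)
  if (*P.get() > 0) { }       // flagged; fix: if (*P > 0)
  int *Raw = P.get();         // not flagged: a raw pointer is genuinely needed
  (void)Raw;
}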
} -} // namespace - void RedundantSmartptrGetCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "IgnoreMacros", IgnoreMacros); @@ -130,8 +129,7 @@ void RedundantSmartptrGetCheck::registerMatchers(MatchFinder *Finder) { registerMatchersForGetEquals(Finder, this); } -namespace { -bool allReturnTypesMatch(const MatchFinder::MatchResult &Result) { +static bool allReturnTypesMatch(const MatchFinder::MatchResult &Result) { if (Result.Nodes.getNodeAs<Decl>("duck_typing") == nullptr) return true; // Verify that the types match. @@ -146,7 +144,6 @@ bool allReturnTypesMatch(const MatchFinder::MatchResult &Result) { Result.Nodes.getNodeAs<Type>("getType")->getUnqualifiedDesugaredType(); return OpArrowType == OpStarType && OpArrowType == GetType; } -} // namespace void RedundantSmartptrGetCheck::check(const MatchFinder::MatchResult &Result) { if (!allReturnTypesMatch(Result)) return; diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp index 4184c295b5f0a..9f3f26b775c9a 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp @@ -21,20 +21,17 @@ using namespace clang::ast_matchers; namespace clang::tidy::readability { -namespace { - -StringRef getText(const ASTContext &Context, SourceRange Range) { +static StringRef getText(const ASTContext &Context, SourceRange Range) { return Lexer::getSourceText(CharSourceRange::getTokenRange(Range), Context.getSourceManager(), Context.getLangOpts()); } -template <typename T> StringRef getText(const ASTContext &Context, T &Node) { +template <typename T> +static StringRef getText(const ASTContext &Context, T &Node) { return getText(Context, Node.getSourceRange()); } -} // namespace - static constexpr char SimplifyOperatorDiagnostic[] = "redundant boolean literal supplied to boolean operator"; static constexpr char SimplifyConditionDiagnostic[] = diff --git a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp index 29084f4e875f7..feb248dd62411 100644 --- a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp @@ -288,8 +288,8 @@ static bool applyDiceHeuristic(StringRef Arg, StringRef Param, std::size_t Intersection = 0; // Find the intersection between the two sets. - for (auto IT = ParamBigrams.begin(); IT != ParamBigrams.end(); ++IT) - Intersection += ArgBigrams.count((IT->getKey())); + for (const auto &[Key, _] : ParamBigrams) + Intersection += ArgBigrams.count(Key); // Calculate Dice coefficient. return percentage(Intersection * 2.0, @@ -413,11 +413,10 @@ static bool areTypesCompatible(QualType ArgType, QualType ParamType, // Arithmetic types are interconvertible, except scoped enums.
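The structured-binding rewrite above sits inside a Dice-coefficient heuristic: similarity is twice the number of shared bigrams divided by the total bigram count of the argument and parameter names. A freestanding approximation of that computation (the check itself counts bigrams in llvm::StringMap sets and compares against a percentage threshold):

#include <cstddef>
#include <set>
#include <string>

static double diceSimilarity(const std::string &A, const std::string &B) {
  std::set<std::string> ABigrams, BBigrams;
  for (std::size_t I = 0; I + 1 < A.size(); ++I)
    ABigrams.insert(A.substr(I, 2));
  for (std::size_t I = 0; I + 1 < B.size(); ++I)
    BBigrams.insert(B.substr(I, 2));
  if (ABigrams.empty() && BBigrams.empty())
    return 0.0;
  std::size_t Intersection = 0;
  for (const std::string &Bigram : ABigrams)
    Intersection += BBigrams.count(Bigram);
  // Dice coefficient: twice the overlap over the total bigram count.
  return 2.0 * static_cast<double>(Intersection) /
         static_cast<double>(ABigrams.size() + BBigrams.size());
}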
if (ParamType->isArithmeticType() && ArgType->isArithmeticType()) { - if ((ParamType->isEnumeralType() && ParamType->castAsCanonical<EnumType>() - ->getOriginalDecl() - ->isScoped()) || + if ((ParamType->isEnumeralType() && + ParamType->castAsCanonical<EnumType>()->getDecl()->isScoped()) || (ArgType->isEnumeralType() && - ArgType->castAsCanonical<EnumType>()->getOriginalDecl()->isScoped())) + ArgType->castAsCanonical<EnumType>()->getDecl()->isScoped())) return false; return true; diff --git a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp index c1dc209fd079d..740a68d852c9e 100644 --- a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.cpp @@ -55,8 +55,10 @@ struct NewSuffix { std::optional<FixItHint> FixIt; }; -std::optional<SourceLocation> getMacroAwareLocation(SourceLocation Loc, - const SourceManager &SM) { +} // namespace + +static std::optional<SourceLocation> +getMacroAwareLocation(SourceLocation Loc, const SourceManager &SM) { // Do nothing if the provided location is invalid. if (Loc.isInvalid()) return std::nullopt; @@ -67,8 +69,8 @@ std::optional<SourceLocation> getMacroAwareLocation(SourceLocation Loc, return SpellingLoc; } -std::optional<SourceRange> getMacroAwareSourceRange(SourceRange Loc, - const SourceManager &SM) { +static std::optional<SourceRange> +getMacroAwareSourceRange(SourceRange Loc, const SourceManager &SM) { std::optional<SourceLocation> Begin = getMacroAwareLocation(Loc.getBegin(), SM); std::optional<SourceLocation> End = getMacroAwareLocation(Loc.getEnd(), SM); if (!Begin || !End) return std::nullopt; return SourceRange(*Begin, *End); } @@ -77,7 +79,7 @@ std::optional<SourceRange> getMacroAwareSourceRange(SourceRange Loc, -std::optional<std::string> +static std::optional<std::string> getNewSuffix(llvm::StringRef OldSuffix, const std::vector<StringRef> &NewSuffixes) { // If there is no config, just uppercase the entirety of the suffix. @@ -96,7 +98,7 @@ getNewSuffix(llvm::StringRef OldSuffix, } template <typename LiteralType> -std::optional<NewSuffix> +static std::optional<NewSuffix> shouldReplaceLiteralSuffix(const Expr &Literal, const std::vector<StringRef> &NewSuffixes, const SourceManager &SM, const LangOptions &LO) { @@ -174,8 +176,6 @@ shouldReplaceLiteralSuffix(const Expr &Literal, return ReplacementDsc; } -} // namespace - UppercaseLiteralSuffixCheck::UppercaseLiteralSuffixCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), diff --git a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp index 57453ad089a2c..a5b08836db2c8 100644 --- a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp @@ -19,9 +19,8 @@ namespace clang::tidy::utils::decl_ref_expr { using namespace ::clang::ast_matchers; using llvm::SmallPtrSet; -namespace { - -template <typename S> bool isSetDifferenceEmpty(const S &S1, const S &S2) { +template <typename S> +static bool isSetDifferenceEmpty(const S &S1, const S &S2) { for (auto E : S1) if (S2.count(E) == 0) return false; @@ -30,15 +29,15 @@ template <typename S> bool isSetDifferenceEmpty(const S &S1, const S &S2) { // Extracts all Nodes keyed by ID from Matches and inserts them into Nodes. template <typename Node> -void extractNodesByIdTo(ArrayRef<BoundNodes> Matches, StringRef ID, - SmallPtrSet<const Node *, 16> &Nodes) { +static void extractNodesByIdTo(ArrayRef<BoundNodes> Matches, StringRef ID, + SmallPtrSet<const Node *, 16> &Nodes) { for (const auto &Match : Matches) Nodes.insert(Match.getNodeAs<Node>(ID)); } // Returns true if both types refer to the same type, // ignoring the const-qualifier.
-bool isSameTypeIgnoringConst(QualType A, QualType B) { +static bool isSameTypeIgnoringConst(QualType A, QualType B) { A = A.getCanonicalType(); B = B.getCanonicalType(); A.addConst(); @@ -47,7 +46,8 @@ bool isSameTypeIgnoringConst(QualType A, QualType B) { } // Returns true if `D` and `O` have the same parameter types. -bool hasSameParameterTypes(const CXXMethodDecl &D, const CXXMethodDecl &O) { +static bool hasSameParameterTypes(const CXXMethodDecl &D, + const CXXMethodDecl &O) { if (D.getNumParams() != O.getNumParams()) return false; for (int I = 0, E = D.getNumParams(); I < E; ++I) { @@ -60,7 +60,7 @@ bool hasSameParameterTypes(const CXXMethodDecl &D, const CXXMethodDecl &O) { // If `D` has a const-qualified overload with otherwise identical // ref-qualifiers and parameter types, returns that overload. -const CXXMethodDecl *findConstOverload(const CXXMethodDecl &D) { +static const CXXMethodDecl *findConstOverload(const CXXMethodDecl &D) { assert(!D.isConst()); DeclContext::lookup_result LookupResult = @@ -81,7 +81,7 @@ const CXXMethodDecl *findConstOverload(const CXXMethodDecl &D) { // Returns true if both types are pointers or reference to the same type, // ignoring the const-qualifier. -bool pointsToSameTypeIgnoringConst(QualType A, QualType B) { +static bool pointsToSameTypeIgnoringConst(QualType A, QualType B) { assert(A->isPointerType() || A->isReferenceType()); assert(B->isPointerType() || B->isReferenceType()); return isSameTypeIgnoringConst(A->getPointeeType(), B->getPointeeType()); @@ -122,7 +122,7 @@ bool pointsToSameTypeIgnoringConst(QualType A, QualType B) { // // This function checks (A) ad (B), but the caller should make sure that the // object is not mutated through the return value. -bool isLikelyShallowConst(const CXXMethodDecl &M) { +static bool isLikelyShallowConst(const CXXMethodDecl &M) { assert(!M.isConst()); // The method can mutate our variable. @@ -146,6 +146,8 @@ bool isLikelyShallowConst(const CXXMethodDecl &M) { return isSameTypeIgnoringConst(CallTy, OverloadTy); } +namespace { + // A matcher that matches DeclRefExprs that are used in ways such that the // underlying declaration is not modified. // If the declaration is of pointer type, `Indirections` specifies the level diff --git a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp index 044f89be61342..b068ae24a391b 100644 --- a/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp +++ b/clang-tools-extra/clang-tidy/utils/DesignatedInitializers.cpp @@ -19,8 +19,6 @@ namespace clang::tidy::utils { -namespace { - /// Returns true if Name is reserved, like _Foo or __Vector_base. static inline bool isReservedName(llvm::StringRef Name) { // This doesn't catch all cases, but the most common. @@ -28,6 +26,8 @@ static inline bool isReservedName(llvm::StringRef Name) { (isUppercase(Name[1]) || Name[1] == '_'); } +namespace { + // Helper class to iterate over the designator names of an aggregate type. // // For an array type, yields [0], [1], [2]... @@ -112,6 +112,8 @@ class AggregateDesignatorNames { RecordDecl::field_iterator FieldsEnd; }; +} // namespace + // Collect designator labels describing the elements of an init list. // // This function contributes the designators of some (sub)object, which is @@ -127,10 +129,9 @@ class AggregateDesignatorNames { // '.a:' is produced directly without recursing into the written sublist. // (The written sublist will have a separate collectDesignators() call later). 
// Recursion with Prefix='.b' and Sem = {3, ImplicitValue} produces '.b.x:'. -void collectDesignators(const InitListExpr *Sem, - llvm::DenseMap &Out, - const llvm::DenseSet &NestedBraces, - std::string &Prefix) { +static void collectDesignators( + const InitListExpr *Sem, llvm::DenseMap &Out, + const llvm::DenseSet &NestedBraces, std::string &Prefix) { if (!Sem || Sem->isTransparent()) return; assert(Sem->isSemanticForm()); @@ -170,8 +171,6 @@ void collectDesignators(const InitListExpr *Sem, } } -} // namespace - llvm::DenseMap getUnwrittenDesignators(const InitListExpr *Syn) { assert(Syn->isSyntacticForm()); diff --git a/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp b/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp index 393f935fc31e4..46eebf4e7a86e 100644 --- a/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp @@ -49,10 +49,8 @@ static SmallVector getParentStmts(const Stmt *S, return Result; } -namespace { - -bool isDescendantOrEqual(const Stmt *Descendant, const Stmt *Ancestor, - ASTContext *Context) { +static bool isDescendantOrEqual(const Stmt *Descendant, const Stmt *Ancestor, + ASTContext *Context) { if (Descendant == Ancestor) return true; return llvm::any_of(getParentStmts(Descendant, Context), @@ -61,15 +59,15 @@ bool isDescendantOrEqual(const Stmt *Descendant, const Stmt *Ancestor, }); } -bool isDescendantOfArgs(const Stmt *Descendant, const CallExpr *Call, - ASTContext *Context) { +static bool isDescendantOfArgs(const Stmt *Descendant, const CallExpr *Call, + ASTContext *Context) { return llvm::any_of(Call->arguments(), [Descendant, Context](const Expr *Arg) { return isDescendantOrEqual(Descendant, Arg, Context); }); } -llvm::SmallVector +static llvm::SmallVector getAllInitListForms(const InitListExpr *InitList) { llvm::SmallVector Result = {InitList}; if (const InitListExpr *AltForm = InitList->getSyntacticForm()) @@ -79,8 +77,6 @@ getAllInitListForms(const InitListExpr *InitList) { return Result; } -} // namespace - ExprSequence::ExprSequence(const CFG *TheCFG, const Stmt *Root, ASTContext *TheContext) : Context(TheContext), Root(Root) { diff --git a/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp b/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp index 6a71a11c18754..58e33567f07e9 100644 --- a/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp +++ b/clang-tools-extra/clang-tidy/utils/IncludeSorter.cpp @@ -15,9 +15,8 @@ namespace clang::tidy { namespace utils { -namespace { - -StringRef removeFirstSuffix(StringRef Str, ArrayRef Suffixes) { +static StringRef removeFirstSuffix(StringRef Str, + ArrayRef Suffixes) { for (StringRef Suffix : Suffixes) { if (Str.consume_back(Suffix)) return Str; @@ -25,7 +24,8 @@ StringRef removeFirstSuffix(StringRef Str, ArrayRef Suffixes) { return Str; } -StringRef makeCanonicalName(StringRef Str, IncludeSorter::IncludeStyle Style) { +static StringRef makeCanonicalName(StringRef Str, + IncludeSorter::IncludeStyle Style) { // The list of suffixes to remove from source file names to get the // "canonical" file names. // E.g. tools/sort_includes.cc and tools/sort_includes_test.cc @@ -56,12 +56,12 @@ StringRef makeCanonicalName(StringRef Str, IncludeSorter::IncludeStyle Style) { } // Scan to the end of the line and return the offset of the next line. -size_t findNextLine(const char *Text) { +static size_t findNextLine(const char *Text) { size_t EOLIndex = std::strcspn(Text, "\n"); return Text[EOLIndex] == '\0' ? 
EOLIndex : EOLIndex + 1; } -IncludeSorter::IncludeKinds +static IncludeSorter::IncludeKinds determineIncludeKind(StringRef CanonicalFile, StringRef IncludeFile, bool IsAngled, IncludeSorter::IncludeStyle Style) { // Compute the two "canonical" forms of the include's filename sans extension. @@ -101,8 +101,8 @@ determineIncludeKind(StringRef CanonicalFile, StringRef IncludeFile, return IncludeSorter::IK_NonSystemInclude; } -int compareHeaders(StringRef LHS, StringRef RHS, - IncludeSorter::IncludeStyle Style) { +static int compareHeaders(StringRef LHS, StringRef RHS, + IncludeSorter::IncludeStyle Style) { if (Style == IncludeSorter::IncludeStyle::IS_Google_ObjC) { const std::pair &Mismatch = std::mismatch(LHS.begin(), LHS.end(), RHS.begin(), RHS.end()); @@ -118,8 +118,6 @@ int compareHeaders(StringRef LHS, StringRef RHS, return LHS.compare(RHS); } -} // namespace - IncludeSorter::IncludeSorter(const SourceManager *SourceMgr, const FileID FileID, StringRef FileName, IncludeStyle Style) diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp index 70f6092a5e4bc..6bd6d981858cb 100644 --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp @@ -93,9 +93,9 @@ static const NamedDecl *findDecl(const RecordDecl &RecDecl, return nullptr; } -/// Returns the function that \p Method is overridding. If There are none or +/// Returns the function that \p Method is overriding. If there are none or /// multiple overrides it returns nullptr. If the overridden function itself is -/// overridding then it will recurse up to find the first decl of the function. +/// overriding then it will recurse up to find the first decl of the function. static const CXXMethodDecl *getOverrideMethod(const CXXMethodDecl *Method) { if (Method->size_overridden_methods() != 1) return nullptr; @@ -331,7 +331,7 @@ class RenamerClangTidyVisitor } bool VisitTagTypeLoc(const TagTypeLoc &Loc) { - Check->addUsage(Loc.getOriginalDecl(), Loc.getNameLoc(), SM); + Check->addUsage(Loc.getDecl(), Loc.getNameLoc(), SM); return true; } @@ -490,7 +490,7 @@ void RenamerClangTidyCheck::addUsage(const NamedDecl *Decl, NamingCheckFailure &Failure = FailureIter->second; Failure.Info = std::move(*MaybeFailure); - // Don't overwritte the failure status if it was already set. + // Don't overwrite the failure status if it was already set.
if (!Failure.shouldFix()) { return; } diff --git a/clang-tools-extra/clangd/DumpAST.cpp b/clang-tools-extra/clangd/DumpAST.cpp index 9a8d41d870929..cd409a2b930ef 100644 --- a/clang-tools-extra/clangd/DumpAST.cpp +++ b/clang-tools-extra/clangd/DumpAST.cpp @@ -261,7 +261,7 @@ class DumpVisitor : public RecursiveASTVisitor { return TL.getType().getLocalQualifiers().getAsString( Ctx.getPrintingPolicy()); if (const auto *TT = dyn_cast(TL.getTypePtr())) - return getDetail(TT->getOriginalDecl()); + return getDetail(TT->getDecl()); if (const auto *DT = dyn_cast(TL.getTypePtr())) if (DT->isDeduced()) return DT->getDeducedType().getAsString(Ctx.getPrintingPolicy()); diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp index 799c64b8dab4d..f80f7325d17ae 100644 --- a/clang-tools-extra/clangd/FindTarget.cpp +++ b/clang-tools-extra/clangd/FindTarget.cpp @@ -50,7 +50,7 @@ namespace clang { namespace clangd { namespace { -LLVM_ATTRIBUTE_UNUSED std::string nodeToString(const DynTypedNode &N) { +[[maybe_unused]] std::string nodeToString(const DynTypedNode &N) { std::string S = std::string(N.getNodeKind().asStringRef()); { llvm::raw_string_ostream OS(S); @@ -366,7 +366,7 @@ struct TargetFinder { Visitor(TargetFinder &Outer, RelSet Flags) : Outer(Outer), Flags(Flags) {} void VisitTagType(const TagType *TT) { - Outer.add(cast(TT)->getOriginalDecl(), Flags); + Outer.add(cast(TT)->getDecl(), Flags); } void VisitUsingType(const UsingType *ET) { @@ -861,7 +861,7 @@ refInTypeLoc(TypeLoc L, const HeuristicResolver *Resolver) { Refs.push_back(ReferenceLoc{L.getQualifierLoc(), L.getNameLoc(), /*IsDecl=*/false, - {L.getOriginalDecl()}}); + {L.getDecl()}}); } void VisitTemplateTypeParmTypeLoc(TemplateTypeParmTypeLoc L) { @@ -1040,8 +1040,8 @@ class ExplicitReferenceCollector if (auto *S = N.get()) return refInStmt(S, Resolver); if (auto *NNSL = N.get()) { - if (auto TL = NNSL->getAsTypeLoc()) - return refInTypeLoc(NNSL->getAsTypeLoc(), Resolver); + if (TypeLoc TL = NNSL->getAsTypeLoc()) + return refInTypeLoc(TL, Resolver); // (!) 'DeclRelation::Alias' ensures we do not lose namespace aliases. NestedNameSpecifierLoc Qualifier = NNSL->getAsNamespaceAndPrefix().Prefix; SourceLocation NameLoc = NNSL->getLocalBeginLoc(); diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index acc8e87ed4764..34369e188d4ec 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -179,7 +179,7 @@ HoverInfo::PrintedType printType(QualType QT, ASTContext &ASTCtx, if (!QT.isNull() && !QT.hasQualifiers() && PP.SuppressTagKeyword) { if (auto *TT = llvm::dyn_cast(QT.getTypePtr()); TT && TT->isCanonicalUnqualified()) - OS << TT->getOriginalDecl()->getKindName() << " "; + OS << TT->getDecl()->getKindName() << " "; } QT.print(OS, PP); diff --git a/clang-tools-extra/clangd/IncludeFixer.cpp b/clang-tools-extra/clangd/IncludeFixer.cpp index c27d960cd963b..3f3d7fbefd58e 100644 --- a/clang-tools-extra/clangd/IncludeFixer.cpp +++ b/clang-tools-extra/clangd/IncludeFixer.cpp @@ -173,7 +173,7 @@ std::vector IncludeFixer::fix(DiagnosticsEngine::Level DiagLevel, // `enum x : int;' is not formally an incomplete type. // We may need a full definition anyway. 
if (auto * ET = llvm::dyn_cast(T)) - if (!ET->getOriginalDecl()->getDefinition()) + if (!ET->getDecl()->getDefinition()) return fixIncompleteType(*T); } } diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp index d56b93e5f36dc..23bd02304a4fd 100644 --- a/clang-tools-extra/clangd/InlayHints.cpp +++ b/clang-tools-extra/clangd/InlayHints.cpp @@ -60,7 +60,7 @@ const NamedDecl *getDeclForType(const Type *T) { case Type::Enum: case Type::Record: case Type::InjectedClassName: - return cast(T)->getOriginalDecl(); + return cast(T)->getDecl(); case Type::TemplateSpecialization: return cast(T) ->getTemplateName() diff --git a/clang-tools-extra/clangd/Protocol.h b/clang-tools-extra/clangd/Protocol.h index 3a6bf155ee153..2248572060431 100644 --- a/clang-tools-extra/clangd/Protocol.h +++ b/clang-tools-extra/clangd/Protocol.h @@ -1627,6 +1627,12 @@ struct CallHierarchyIncomingCall { /// The range at which the calls appear. /// This is relative to the caller denoted by `From`. std::vector fromRanges; + + /// If the callee is a virtual method, incoming calls also include calls + /// to its base method. Such a caller might be a false positive; we + /// currently have no way of discerning this. + /// This is a clangd extension. + bool mightNeverCall = false; }; llvm::json::Value toJSON(const CallHierarchyIncomingCall &); diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index 05e04ac161e54..ef45acf501612 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -21,6 +21,7 @@ #include "clang-include-cleaner/Types.h" #include "index/Index.h" #include "index/Merge.h" +#include "index/Ref.h" #include "index/Relation.h" #include "index/SymbolCollector.h" #include "index/SymbolID.h" @@ -56,6 +57,7 @@ #include "clang/Tooling/Syntax/Tokens.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" @@ -66,6 +68,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" +#include #include #include #include @@ -2350,51 +2353,64 @@ incomingCalls(const CallHierarchyItem &Item, const SymbolIndex *Index) { // to an AST node isn't cheap, particularly when the declaration isn't // in the main file. // FIXME: Consider also using AST information when feasible. - RefsRequest Request; - Request.IDs.insert(*ID); - Request.WantContainer = true; - // We could restrict more specifically to calls by introducing a new RefKind, - but non-call references (such as address-of-function) can still be - interesting as they can indicate indirect calls. - Request.Filter = RefKind::Reference; - // Initially store the ranges in a map keyed by SymbolID of the caller. - // This allows us to group different calls with the same caller - // into the same CallHierarchyIncomingCall. - llvm::DenseMap> CallsIn; - // We can populate the ranges based on a refs request only. As we do so, we - also accumulate the container IDs into a lookup request.
- LookupRequest ContainerLookup; - Index->refs(Request, [&](const Ref &R) { - auto Loc = indexToLSPLocation(R.Location, Item.uri.file()); - if (!Loc) { - elog("incomingCalls failed to convert location: {0}", Loc.takeError()); - return; - } - CallsIn[R.Container].push_back(*Loc); + auto QueryIndex = [&](llvm::DenseSet IDs, bool MightNeverCall) { + RefsRequest Request; + Request.IDs = std::move(IDs); + Request.WantContainer = true; + // We could restrict more specifically to calls by introducing a new + // RefKind, but non-call references (such as address-of-function) can still + // be interesting as they can indicate indirect calls. + Request.Filter = RefKind::Reference; + // Initially store the ranges in a map keyed by SymbolID of the caller. + // This allows us to group different calls with the same caller + // into the same CallHierarchyIncomingCall. + llvm::DenseMap> CallsIn; + // We can populate the ranges based on a refs request only. As we do so, we + // also accumulate the container IDs into a lookup request. + LookupRequest ContainerLookup; + Index->refs(Request, [&](const Ref &R) { + auto Loc = indexToLSPLocation(R.Location, Item.uri.file()); + if (!Loc) { + elog("incomingCalls failed to convert location: {0}", Loc.takeError()); + return; + } + CallsIn[R.Container].push_back(*Loc); - ContainerLookup.IDs.insert(R.Container); - }); - // Perform the lookup request and combine its results with CallsIn to - // get complete CallHierarchyIncomingCall objects. - Index->lookup(ContainerLookup, [&](const Symbol &Caller) { - auto It = CallsIn.find(Caller.ID); - assert(It != CallsIn.end()); - if (auto CHI = symbolToCallHierarchyItem(Caller, Item.uri.file())) { - std::vector FromRanges; - for (const Location &L : It->second) { - if (L.uri != CHI->uri) { - // Call location not in same file as caller. - // This can happen in some edge cases. There's not much we can do, - // since the protocol only allows returning ranges interpreted as - // being in the caller's file. - continue; + ContainerLookup.IDs.insert(R.Container); + }); + // Perform the lookup request and combine its results with CallsIn to + // get complete CallHierarchyIncomingCall objects. + Index->lookup(ContainerLookup, [&](const Symbol &Caller) { + auto It = CallsIn.find(Caller.ID); + assert(It != CallsIn.end()); + if (auto CHI = symbolToCallHierarchyItem(Caller, Item.uri.file())) { + std::vector FromRanges; + for (const Location &L : It->second) { + if (L.uri != CHI->uri) { + // Call location not in same file as caller. + // This can happen in some edge cases. There's not much we can do, + // since the protocol only allows returning ranges interpreted as + // being in the caller's file. + continue; + } + FromRanges.push_back(L.range); } - FromRanges.push_back(L.range); + Results.push_back(CallHierarchyIncomingCall{ + std::move(*CHI), std::move(FromRanges), MightNeverCall}); } - Results.push_back( - CallHierarchyIncomingCall{std::move(*CHI), std::move(FromRanges)}); - } - }); + }); + }; + QueryIndex({ID.get()}, false); + // In the case of being a virtual function we also want to return + // potential calls through the base function. + if (Item.kind == SymbolKind::Method) { + llvm::DenseSet IDs; + RelationsRequest Req{{ID.get()}, RelationKind::OverriddenBy, std::nullopt}; + Index->reverseRelations(Req, [&](const SymbolID &, const Symbol &Caller) { + IDs.insert(Caller.ID); + }); + QueryIndex(std::move(IDs), true); + } // Sort results by name of container. 
llvm::sort(Results, [](const CallHierarchyIncomingCall &A, const CallHierarchyIncomingCall &B) { diff --git a/clang-tools-extra/clangd/index/Index.cpp b/clang-tools-extra/clangd/index/Index.cpp index 86dc6ed763344..a2ec910606d92 100644 --- a/clang-tools-extra/clangd/index/Index.cpp +++ b/clang-tools-extra/clangd/index/Index.cpp @@ -77,6 +77,12 @@ void SwapIndex::relations( return snapshot()->relations(R, CB); } +void SwapIndex::reverseRelations( + const RelationsRequest &R, + llvm::function_ref CB) const { + return snapshot()->reverseRelations(R, CB); +} + llvm::unique_function SwapIndex::indexedFiles() const { // The index snapshot should outlive this method return value. diff --git a/clang-tools-extra/clangd/index/Index.h b/clang-tools-extra/clangd/index/Index.h index a193b1a191216..b62b15d103112 100644 --- a/clang-tools-extra/clangd/index/Index.h +++ b/clang-tools-extra/clangd/index/Index.h @@ -181,6 +181,14 @@ class SymbolIndex { llvm::function_ref Callback) const = 0; + /// Finds all relations (O, P, S) stored in the index such that S is among + /// Req.Subjects and P is Req.Predicate, and invokes \p Callback for (S, O) in + /// each. Currently only allows the OverriddenBy relation. + virtual void reverseRelations( + const RelationsRequest &Req, + llvm::function_ref + Callback) const = 0; + /// Returns function which checks if the specified file was used to build this /// index or not. The function must only be called while the index is alive. using IndexedFiles = @@ -214,6 +222,11 @@ class SwapIndex : public SymbolIndex { llvm::function_ref) const override; + void + reverseRelations(const RelationsRequest &, + llvm::function_ref) + const override; + llvm::unique_function indexedFiles() const override; diff --git a/clang-tools-extra/clangd/index/MemIndex.cpp b/clang-tools-extra/clangd/index/MemIndex.cpp index 9c9d3942bdee6..feac1cf4fb7a7 100644 --- a/clang-tools-extra/clangd/index/MemIndex.cpp +++ b/clang-tools-extra/clangd/index/MemIndex.cpp @@ -125,6 +125,27 @@ void MemIndex::relations( } } +void MemIndex::reverseRelations( + const RelationsRequest &Req, + llvm::function_ref Callback) const { + assert(Req.Predicate == RelationKind::OverriddenBy); + uint32_t Remaining = Req.Limit.value_or(std::numeric_limits::max()); + for (const SymbolID &Subject : Req.Subjects) { + LookupRequest LookupReq; + auto It = ReverseRelations.find( + std::make_pair(Subject, static_cast(Req.Predicate))); + if (It != ReverseRelations.end()) { + for (const auto &Obj : It->second) { + if (Remaining > 0) { + --Remaining; + LookupReq.IDs.insert(Obj); + } + } + } + lookup(LookupReq, [&](const Symbol &Object) { Callback(Subject, Object); }); + } +} + llvm::unique_function MemIndex::indexedFiles() const { return [this](llvm::StringRef FileURI) { @@ -134,7 +155,8 @@ MemIndex::indexedFiles() const { size_t MemIndex::estimateMemoryUsage() const { return Index.getMemorySize() + Refs.getMemorySize() + - Relations.getMemorySize() + BackingDataSize; + Relations.getMemorySize() + ReverseRelations.getMemorySize() + + BackingDataSize; } } // namespace clangd diff --git a/clang-tools-extra/clangd/index/MemIndex.h b/clang-tools-extra/clangd/index/MemIndex.h index fb1052b0c7ca8..8ece9994872a9 100644 --- a/clang-tools-extra/clangd/index/MemIndex.h +++ b/clang-tools-extra/clangd/index/MemIndex.h @@ -10,6 +10,7 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_MEMINDEX_H #include "index/Index.h" +#include "index/Relation.h" #include "llvm/ADT/StringSet.h" #include @@ -27,10 +28,16 @@ class MemIndex : public SymbolIndex { Index[S.ID] 
= &S; for (const std::pair> &R : Refs) this->Refs.try_emplace(R.first, R.second.begin(), R.second.end()); - for (const Relation &R : Relations) + for (const Relation &R : Relations) { this->Relations[std::make_pair(R.Subject, static_cast(R.Predicate))] .push_back(R.Object); + if (R.Predicate == RelationKind::OverriddenBy) { + this->ReverseRelations[std::make_pair( + R.Object, static_cast(R.Predicate))] + .push_back(R.Subject); + } + } } // Symbols are owned by BackingData, Index takes ownership. template Callback) const override; + void + reverseRelations(const RelationsRequest &Req, + llvm::function_ref + Callback) const override; + llvm::unique_function indexedFiles() const override; @@ -94,6 +106,9 @@ class MemIndex : public SymbolIndex { static_assert(sizeof(RelationKind) == sizeof(uint8_t), "RelationKind should be of same size as a uint8_t"); llvm::DenseMap, std::vector> Relations; + // Reverse relations, currently only for OverriddenBy + llvm::DenseMap, std::vector> + ReverseRelations; // Set of files which were used during this index build. llvm::StringSet<> Files; // Contents of the index (symbols, references, etc.) diff --git a/clang-tools-extra/clangd/index/Merge.cpp b/clang-tools-extra/clangd/index/Merge.cpp index aecca38a885b6..625b1c6926a28 100644 --- a/clang-tools-extra/clangd/index/Merge.cpp +++ b/clang-tools-extra/clangd/index/Merge.cpp @@ -221,6 +221,32 @@ void MergedIndex::relations( }); } +void MergedIndex::reverseRelations( + const RelationsRequest &Req, + llvm::function_ref Callback) const { + uint32_t Remaining = Req.Limit.value_or(std::numeric_limits::max()); + // Return results from both indexes but avoid duplicates. + // We might return stale relations from the static index; + // we don't currently have a good way of identifying them. + llvm::DenseSet> SeenRelations; + Dynamic->reverseRelations( + Req, [&](const SymbolID &Subject, const Symbol &Object) { + Callback(Subject, Object); + SeenRelations.insert(std::make_pair(Subject, Object.ID)); + --Remaining; + }); + if (Remaining == 0) + return; + Static->reverseRelations( + Req, [&](const SymbolID &Subject, const Symbol &Object) { + if (Remaining > 0 && + !SeenRelations.count(std::make_pair(Subject, Object.ID))) { + --Remaining; + Callback(Subject, Object); + } + }); +} + // Returns true if \p L is (strictly) preferred to \p R (e.g. by file paths). If // neither is preferred, this returns false. 
static bool prefer(const SymbolLocation &L, const SymbolLocation &R) { diff --git a/clang-tools-extra/clangd/index/Merge.h b/clang-tools-extra/clangd/index/Merge.h index 7441be6e57e85..5910c27cab58a 100644 --- a/clang-tools-extra/clangd/index/Merge.h +++ b/clang-tools-extra/clangd/index/Merge.h @@ -44,6 +44,10 @@ class MergedIndex : public SymbolIndex { void relations(const RelationsRequest &, llvm::function_ref) const override; + void + reverseRelations(const RelationsRequest &, + llvm::function_ref) + const override; llvm::unique_function indexedFiles() const override; size_t estimateMemoryUsage() const override { diff --git a/clang-tools-extra/clangd/index/ProjectAware.cpp b/clang-tools-extra/clangd/index/ProjectAware.cpp index 9836f0130362a..34d037b854e3d 100644 --- a/clang-tools-extra/clangd/index/ProjectAware.cpp +++ b/clang-tools-extra/clangd/index/ProjectAware.cpp @@ -51,6 +51,11 @@ class ProjectAwareIndex : public SymbolIndex { llvm::function_ref Callback) const override; + void + reverseRelations(const RelationsRequest &, + llvm::function_ref) + const override; + llvm::unique_function indexedFiles() const override; @@ -124,6 +129,14 @@ void ProjectAwareIndex::relations( return Idx->relations(Req, Callback); } +void ProjectAwareIndex::reverseRelations( + const RelationsRequest &Req, + llvm::function_ref Callback) const { + trace::Span Tracer("ProjectAwareIndex::reverseRelations"); + if (auto *Idx = getIndex()) + return Idx->reverseRelations(Req, Callback); +} + llvm::unique_function ProjectAwareIndex::indexedFiles() const { trace::Span Tracer("ProjectAwareIndex::indexedFiles"); diff --git a/clang-tools-extra/clangd/index/dex/Dex.cpp b/clang-tools-extra/clangd/index/dex/Dex.cpp index 575a96a112979..179d8c4da0b3e 100644 --- a/clang-tools-extra/clangd/index/dex/Dex.cpp +++ b/clang-tools-extra/clangd/index/dex/Dex.cpp @@ -379,6 +379,28 @@ void Dex::relations( } } +void Dex::reverseRelations( + const RelationsRequest &Req, + llvm::function_ref Callback) const { + trace::Span Tracer("Dex reverseRelations"); + assert(Req.Predicate == RelationKind::OverriddenBy); + uint32_t Remaining = Req.Limit.value_or(std::numeric_limits::max()); + for (const SymbolID &Subject : Req.Subjects) { + LookupRequest LookupReq; + auto It = ReverseRelations.find( + std::make_pair(Subject, static_cast(Req.Predicate))); + if (It != ReverseRelations.end()) { + for (const auto &Obj : It->second) { + if (Remaining > 0) { + --Remaining; + LookupReq.IDs.insert(Obj); + } + } + } + lookup(LookupReq, [&](const Symbol &Object) { Callback(Subject, Object); }); + } +} + llvm::unique_function Dex::indexedFiles() const { return [this](llvm::StringRef FileURI) { @@ -396,6 +418,7 @@ size_t Dex::estimateMemoryUsage() const { Bytes += Refs.getMemorySize(); Bytes += RevRefs.size() * sizeof(RevRef); Bytes += Relations.getMemorySize(); + Bytes += ReverseRelations.getMemorySize(); return Bytes + BackingDataSize; } diff --git a/clang-tools-extra/clangd/index/dex/Dex.h b/clang-tools-extra/clangd/index/dex/Dex.h index 502f597d81ef0..1ea7d8c06c67c 100644 --- a/clang-tools-extra/clangd/index/dex/Dex.h +++ b/clang-tools-extra/clangd/index/dex/Dex.h @@ -43,10 +43,16 @@ class Dex : public SymbolIndex { this->Symbols.push_back(&Sym); for (auto &&Ref : Refs) this->Refs.try_emplace(Ref.first, Ref.second); - for (auto &&Rel : Relations) + for (auto &&Rel : Relations) { this->Relations[std::make_pair(Rel.Subject, static_cast(Rel.Predicate))] .push_back(Rel.Object); + if (Rel.Predicate == RelationKind::OverriddenBy) { +
this->ReverseRelations[std::make_pair(Rel.Object, static_cast( + Rel.Predicate))] + .push_back(Rel.Subject); + } + } buildIndex(SupportContainedRefs); } // Symbols and Refs are owned by BackingData, Index takes ownership. @@ -96,6 +102,11 @@ class Dex : public SymbolIndex { llvm::function_ref Callback) const override; + void + reverseRelations(const RelationsRequest &, + llvm::function_ref) + const override; + llvm::unique_function indexedFiles() const override; @@ -142,6 +153,9 @@ class Dex : public SymbolIndex { static_assert(sizeof(RelationKind) == sizeof(uint8_t), "RelationKind should be of same size as a uint8_t"); llvm::DenseMap, std::vector> Relations; + // Reverse relations, currently only for OverriddenBy + llvm::DenseMap, std::vector> + ReverseRelations; std::shared_ptr KeepAlive; // poor man's move-only std::any // Set of files which were used during this index build. llvm::StringSet<> Files; diff --git a/clang-tools-extra/clangd/index/remote/Client.cpp b/clang-tools-extra/clangd/index/remote/Client.cpp index 79b827126b4ef..3b31a9fb67272 100644 --- a/clang-tools-extra/clangd/index/remote/Client.cpp +++ b/clang-tools-extra/clangd/index/remote/Client.cpp @@ -164,6 +164,17 @@ class IndexClient : public clangd::SymbolIndex { }); } + void reverseRelations( + const clangd::RelationsRequest &Request, + llvm::function_ref + Callback) const override { + streamRPC(Request, &remote::v1::SymbolIndex::Stub::ReverseRelations, + // Unpack protobuf Relation. + [&](std::pair SubjectAndObject) { + Callback(SubjectAndObject.first, SubjectAndObject.second); + }); + } + llvm::unique_function indexedFiles() const override { // FIXME: For now we always return IndexContents::None regardless of whether diff --git a/clang-tools-extra/clangd/index/remote/Service.proto b/clang-tools-extra/clangd/index/remote/Service.proto index 43023321cb9e1..3223298f608fc 100644 --- a/clang-tools-extra/clangd/index/remote/Service.proto +++ b/clang-tools-extra/clangd/index/remote/Service.proto @@ -24,4 +24,6 @@ service SymbolIndex { rpc ContainedRefs(ContainedRefsRequest) returns (stream ContainedRefsReply) {} rpc Relations(RelationsRequest) returns (stream RelationsReply) {} + + rpc ReverseRelations(RelationsRequest) returns (stream RelationsReply) {} } diff --git a/clang-tools-extra/clangd/index/remote/server/Server.cpp b/clang-tools-extra/clangd/index/remote/server/Server.cpp index 890b6c27ed928..af9e9c3c8ff71 100644 --- a/clang-tools-extra/clangd/index/remote/server/Server.cpp +++ b/clang-tools-extra/clangd/index/remote/server/Server.cpp @@ -351,6 +351,54 @@ class RemoteIndexServer final : public v1::SymbolIndex::Service { return grpc::Status::OK; } + grpc::Status + ReverseRelations(grpc::ServerContext *Context, + const RelationsRequest *Request, + grpc::ServerWriter *Reply) override { + auto StartTime = stopwatch::now(); + WithContextValue WithRequestContext(CurrentRequest, Context); + logRequest(*Request); + trace::Span Tracer("ReverseRelationsRequest"); + auto Req = ProtobufMarshaller->fromProtobuf(Request); + if (!Req) { + elog("Can not parse ReverseRelationsRequest from protobuf: {0}", + Req.takeError()); + return grpc::Status::CANCELLED; + } + if (!Req->Limit || *Req->Limit > LimitResults) { + log("[public] Limiting result size for ReverseRelations request from {0} " + "to " + "{1}.", + Req->Limit, LimitResults); + Req->Limit = LimitResults; + } + unsigned Sent = 0; + unsigned FailedToSend = 0; + Index.reverseRelations( + *Req, [&](const SymbolID &Subject, const clangd::Symbol &Object) { + auto SerializedItem = 
ProtobufMarshaller->toProtobuf(Subject, Object); + if (!SerializedItem) { + elog("Unable to convert Relation to protobuf: {0}", + SerializedItem.takeError()); + ++FailedToSend; + return; + } + RelationsReply NextMessage; + *NextMessage.mutable_stream_result() = *SerializedItem; + logResponse(NextMessage); + Reply->Write(NextMessage); + ++Sent; + }); + RelationsReply LastMessage; + LastMessage.mutable_final_result()->set_has_more(true); + logResponse(LastMessage); + Reply->Write(LastMessage); + SPAN_ATTACH(Tracer, "Sent", Sent); + SPAN_ATTACH(Tracer, "Failed to send", FailedToSend); + logRequestSummary("v1/ReverseRelations", Sent, StartTime); + return grpc::Status::OK; + } + // Proxy object to allow proto messages to be lazily serialized as text. struct TextProto { const google::protobuf::Message &M; diff --git a/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp b/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp index f65c74fdbc9ee..3f43362a307a9 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp @@ -322,7 +322,7 @@ bool AddUsing::prepare(const Selection &Inputs) { if (!QualifierToRemove) break; SpelledNameRange = TL.getNameLoc(); - MustInsertAfterLoc = TL.getOriginalDecl()->getBeginLoc(); + MustInsertAfterLoc = TL.getDecl()->getBeginLoc(); break; } case TypeLoc::Typedef: { diff --git a/clang-tools-extra/clangd/refactor/tweaks/PopulateSwitch.cpp b/clang-tools-extra/clangd/refactor/tweaks/PopulateSwitch.cpp index 2c9841762b869..769d73f34d445 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/PopulateSwitch.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/PopulateSwitch.cpp @@ -116,7 +116,7 @@ bool PopulateSwitch::prepare(const Selection &Sel) { EnumT = Cond->getType()->getAsCanonical(); if (!EnumT) return false; - EnumD = EnumT->getOriginalDecl()->getDefinitionOrSelf(); + EnumD = EnumT->getDecl()->getDefinitionOrSelf(); if (EnumD->isDependentType()) return false; diff --git a/clang-tools-extra/clangd/unittests/CallHierarchyTests.cpp b/clang-tools-extra/clangd/unittests/CallHierarchyTests.cpp index 08cc80ff8981e..9859577c7cf7e 100644 --- a/clang-tools-extra/clangd/unittests/CallHierarchyTests.cpp +++ b/clang-tools-extra/clangd/unittests/CallHierarchyTests.cpp @@ -162,6 +162,43 @@ TEST(CallHierarchy, IncomingOneFileObjC) { EXPECT_THAT(IncomingLevel4, IsEmpty()); } +TEST(CallHierarchy, IncomingIncludeOverrides) { + Annotations Source(R"cpp( + void call^ee() {} + struct Interface { + virtual void Func() = 0; + }; + struct Implementation : public Interface { + void Func() override { + $Callee[[callee]](); + } + }; + void Test(Interface& cls){ + cls.$FuncCall[[Func]](); + } + )cpp"); + TestTU TU = TestTU::withCode(Source.code()); + auto AST = TU.build(); + auto Index = TU.index(); + + std::vector Items = + prepareCallHierarchy(AST, Source.point(), testPath(TU.Filename)); + ASSERT_THAT(Items, ElementsAre(withName("callee"))); + auto IncomingLevel1 = incomingCalls(Items[0], Index.get()); + ASSERT_THAT(IncomingLevel1, + ElementsAre(AllOf(from(AllOf(withName("Func"), + withDetail("Implementation::Func"))), + iFromRanges(Source.range("Callee"))))); + auto IncomingLevel2 = incomingCalls(IncomingLevel1[0].from, Index.get()); + ASSERT_THAT( + IncomingLevel2, + ElementsAre(AllOf(from(AllOf(withName("Test"), withDetail("Test"))), + iFromRanges(Source.range("FuncCall"))))); + + auto IncomingLevel3 = incomingCalls(IncomingLevel2[0].from, Index.get()); + EXPECT_THAT(IncomingLevel3, IsEmpty()); +} + TEST(CallHierarchy, 
MainFileOnlyRef) { // In addition to testing that we store refs to main-file only symbols, // this tests that anonymous namespaces do not interfere with the diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp index 768f88f177e56..e2bdb0fe46e37 100644 --- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -1852,6 +1852,11 @@ class IndexRequestCollector : public SymbolIndex { llvm::function_ref) const override {} + void + reverseRelations(const RelationsRequest &, + llvm::function_ref) + const override {} + llvm::unique_function indexedFiles() const override { return [](llvm::StringRef) { return IndexContents::None; }; diff --git a/clang-tools-extra/clangd/unittests/FileDistanceTests.cpp b/clang-tools-extra/clangd/unittests/FileDistanceTests.cpp index 3003582959af3..aed3400ab1296 100644 --- a/clang-tools-extra/clangd/unittests/FileDistanceTests.cpp +++ b/clang-tools-extra/clangd/unittests/FileDistanceTests.cpp @@ -58,7 +58,7 @@ TEST(FileDistanceTests, BadSource) { } // Force the unittest URI scheme to be linked, -static int LLVM_ATTRIBUTE_UNUSED UseUnittestScheme = UnittestSchemeAnchorSource; +[[maybe_unused]] static int UseUnittestScheme = UnittestSchemeAnchorSource; TEST(FileDistanceTests, URI) { FileDistanceOptions Opts; diff --git a/clang-tools-extra/clangd/unittests/QualityTests.cpp b/clang-tools-extra/clangd/unittests/QualityTests.cpp index 4954659a45e02..879a1793128c0 100644 --- a/clang-tools-extra/clangd/unittests/QualityTests.cpp +++ b/clang-tools-extra/clangd/unittests/QualityTests.cpp @@ -33,7 +33,7 @@ namespace clang { namespace clangd { // Force the unittest URI scheme to be linked, -static int LLVM_ATTRIBUTE_UNUSED UnittestSchemeAnchorDest = +[[maybe_unused]] static int UnittestSchemeAnchorDest = UnittestSchemeAnchorSource; namespace { diff --git a/clang-tools-extra/clangd/unittests/RenameTests.cpp b/clang-tools-extra/clangd/unittests/RenameTests.cpp index 5d2a77b62a219..42279b51230e7 100644 --- a/clang-tools-extra/clangd/unittests/RenameTests.cpp +++ b/clang-tools-extra/clangd/unittests/RenameTests.cpp @@ -1674,6 +1674,11 @@ TEST(CrossFileRenameTests, DirtyBuffer) { llvm::function_ref Callback) const override {} + void + reverseRelations(const RelationsRequest &Req, + llvm::function_ref + Callback) const override {} + llvm::unique_function indexedFiles() const override { return [](llvm::StringRef) { return IndexContents::None; }; @@ -1729,6 +1734,11 @@ TEST(CrossFileRenameTests, DeduplicateRefsFromIndex) { llvm::function_ref) const override {} + void + reverseRelations(const RelationsRequest &, + llvm::function_ref) + const override {} + llvm::unique_function indexedFiles() const override { return [](llvm::StringRef) { return IndexContents::None; }; diff --git a/clang-tools-extra/clangd/unittests/URITests.cpp b/clang-tools-extra/clangd/unittests/URITests.cpp index 99d59b65cbbc1..c0ccfc539c452 100644 --- a/clang-tools-extra/clangd/unittests/URITests.cpp +++ b/clang-tools-extra/clangd/unittests/URITests.cpp @@ -16,7 +16,7 @@ namespace clang { namespace clangd { // Force the unittest URI scheme to be linked, -static int LLVM_ATTRIBUTE_UNUSED UnittestSchemeAnchorDest = +[[maybe_unused]] static int UnittestSchemeAnchorDest = UnittestSchemeAnchorSource; namespace { diff --git a/clang-tools-extra/clangd/unittests/lit.cfg.py b/clang-tools-extra/clangd/unittests/lit.cfg.py index 33aa9e61f4ce9..666e9879bb4ad 100644 --- 
a/clang-tools-extra/clangd/unittests/lit.cfg.py +++ b/clang-tools-extra/clangd/unittests/lit.cfg.py @@ -19,12 +19,12 @@ if platform.system() == "Darwin": shlibpath_var = "DYLD_LIBRARY_PATH" -elif platform.system() == "Windows": +elif platform.system() == "Windows" or sys.platform == "cygwin": shlibpath_var = "PATH" else: shlibpath_var = "LD_LIBRARY_PATH" config.environment[shlibpath_var] = os.path.pathsep.join( - ("@SHLIBDIR@", "@LLVM_LIBS_DIR@", config.environment.get(shlibpath_var, "")) + (config.shlibdir, config.llvm_libs_dir, config.environment.get(shlibpath_var, "")) ) # It is not realistically possible to account for all options that could diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 33cc401bcb78f..8a0151f567c24 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -132,7 +132,7 @@ Improvements to clang-tidy when run over C files. If ``-std`` is not specified, it defaults to ``c99-or-later``. -- :program:`clang-tidy` no longer attemps to analyze code from system headers +- :program:`clang-tidy` no longer attempts to analyze code from system headers by default, greatly improving performance. This behavior is disabled if the `SystemHeaders` option is enabled. @@ -274,6 +274,11 @@ Changes in existing checks ` check by fixing false positive from analysis of a conditional expression in C. +- Improved :doc:`bugprone-not-null-terminated-result + ` check by fixing + bogus fix-its for ``strncmp`` and ``wcsncmp`` on Windows and + a crash caused by certain value-dependent expressions. + - Improved :doc:`bugprone-reserved-identifier ` check by ignoring declarations and macros in system headers. @@ -336,8 +341,9 @@ Changes in existing checks - Improved :doc:`misc-const-correctness ` check to avoid false - positives when pointers is tranferred to non-const references - and avoid false positives of function pointer. + positives when a pointer is transferred to a non-const reference, + avoid false positives for function pointers, and fix false + positives on return of a non-const pointer. - Improved :doc:`misc-header-include-cycle ` check performance. @@ -359,6 +365,11 @@ Changes in existing checks ` check to suggest using designated initializers for aliased aggregate types. +- Improved :doc:`modernize-use-integer-sign-comparison + ` by providing + correct fix-its when the right-hand side of a comparison contains a + non-C-style cast.
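To make that entry concrete, this is the shape of the rewrite it describes, mirrored from the updated use-integer-sign-comparison tests later in this patch (illustrative C++20 snippet only; uArray/sArray are the unsigned/signed arrays from those tests):

  #include <utility>  // C++20 std::cmp_* helpers
  // Before: functional (non-C-style) casts on both sides of the comparison.
  if (unsigned(uArray[7]) >= int(sArray[7]))
    return 0;
  // After the corrected fix-it: a balanced call, no unbalanced closing parenthesis.
  if (std::cmp_greater_equal(uArray[7], sArray[7]))
    return 0;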
+ - Improved :doc:`modernize-use-nullptr ` check by fixing a crash on Windows when the check was enabled with a 32-bit :program:`clang-tidy` diff --git a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp index 7bbdc8ba00dca..d444ddd90839d 100644 --- a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp +++ b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp @@ -342,7 +342,7 @@ class ASTWalker : public RecursiveASTVisitor { } bool VisitTagTypeLoc(TagTypeLoc TTL) { - reportType(TTL.getNameLoc(), TTL.getOriginalDecl()); + reportType(TTL.getNameLoc(), TTL.getDecl()); return true; } diff --git a/clang-tools-extra/include-cleaner/test/Unit/lit.cfg.py b/clang-tools-extra/include-cleaner/test/Unit/lit.cfg.py index 0963351abe3b1..c4454df06b386 100644 --- a/clang-tools-extra/include-cleaner/test/Unit/lit.cfg.py +++ b/clang-tools-extra/include-cleaner/test/Unit/lit.cfg.py @@ -11,12 +11,12 @@ if platform.system() == "Darwin": shlibpath_var = "DYLD_LIBRARY_PATH" -elif platform.system() == "Windows": +elif platform.system() == "Windows" or sys.platform == "cygwin": shlibpath_var = "PATH" else: shlibpath_var = "LD_LIBRARY_PATH" config.environment[shlibpath_var] = os.path.pathsep.join( - ("@SHLIBDIR@", "@LLVM_LIBS_DIR@", config.environment.get(shlibpath_var, "")) + (config.shlibdir, config.llvm_libs_dir, config.environment.get(shlibpath_var, "")) ) # It is not realistically possible to account for all options that could diff --git a/clang-tools-extra/test/Unit/lit.cfg.py b/clang-tools-extra/test/Unit/lit.cfg.py index b7376a02c89e1..0254829ed67e4 100644 --- a/clang-tools-extra/test/Unit/lit.cfg.py +++ b/clang-tools-extra/test/Unit/lit.cfg.py @@ -21,7 +21,7 @@ if platform.system() == "Darwin": shlibpath_var = "DYLD_LIBRARY_PATH" -elif platform.system() == "Windows": +elif platform.system() == "Windows" or sys.platform == "cygwin": shlibpath_var = "PATH" else: shlibpath_var = "LD_LIBRARY_PATH" diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c index dccf4ed799499..ca86986f4afce 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c @@ -1,11 +1,6 @@ // RUN: %check_clang_tidy %s bugprone-not-null-terminated-result %t -- \ // RUN: -- -I %S/Inputs/not-null-terminated-result -// FIXME: Something wrong with the APInt un/signed conversion on Windows: -// in 'strncmp(str6, "string", 7);' it tries to inject '4294967302' as length. 
- -// UNSUPPORTED: system-windows - #include "not-null-terminated-result-c.h" #define __STDC_LIB_EXT1__ 1 diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-value-dependent-crash.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-value-dependent-crash.cpp new file mode 100644 index 0000000000000..5f361c35e448c --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-value-dependent-crash.cpp @@ -0,0 +1,23 @@ +// RUN: %check_clang_tidy %s bugprone-not-null-terminated-result %t -- \ +// RUN: -- -std=c++17 -I %S/Inputs/not-null-terminated-result + +// This test case reproduces the crash when the check tries to evaluate +// a value-dependent expression using EvaluateAsInt() in +// bugprone-not-null-terminated-result, where the src parameter of memcpy is +// value-dependent, but the length is not. + +// expected-no-diagnostics + +#include "not-null-terminated-result-cxx.h" + +template +class ValueDependentClass { +public: + void copyData(char* Dst) { + const char* Src = reinterpret_cast(this); + // The length parameter is arbitrary, but the crash is not reproduced if it is N. + memcpy(Dst, Src, 32); + } +}; + +template class ValueDependentClass<42>; // The template parameter value is arbitrary. diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp index 8047db3f19969..688e41471d867 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp @@ -1,11 +1,6 @@ // RUN: %check_clang_tidy -std=c++11-or-later %s bugprone-not-null-terminated-result %t -- \ // RUN: -- -I %S/Inputs/not-null-terminated-result -// FIXME: Something wrong with the APInt un/signed conversion on Windows: -// in 'wcsncmp(wcs6, L"string", 7);' it tries to inject '4294967302' as length. - -// UNSUPPORTED: system-windows - #include "not-null-terminated-result-cxx.h" #define __STDC_LIB_EXT1__ 1 diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp index e20680ceeefa5..4c847b58d395c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp @@ -48,6 +48,11 @@ void ignore_const_alias() { p_local0 = &a[1]; } +void *return_non_const() { + void *const a = nullptr; + return a; +} + void function_pointer_basic() { void (*const fp)() = nullptr; fp(); diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/misplaced-const-cxx17.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/misplaced-const-cxx17.cpp index 7816a091d7adb..56029325420e1 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/misplaced-const-cxx17.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/misplaced-const-cxx17.cpp @@ -3,7 +3,7 @@ // This test previously would cause a failed assertion because the structured // binding declaration had no valid type associated with it. This ensures the // expected clang diagnostic is generated instead. 
-// CHECK-MESSAGES: :[[@LINE+1]]:6: error: decomposition declaration '[x]' requires an initializer [clang-diagnostic-error] +// CHECK-MESSAGES: :[[@LINE+1]]:6: error: structured binding declaration '[x]' requires an initializer [clang-diagnostic-error] auto [x]; struct S { int a; }; diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp index 1f26ff34a4d04..31a3677c2bbd0 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp @@ -92,8 +92,7 @@ int AllComparisons() { if (static_cast(uArray[2]) < static_cast(sArray[2])) return 0; // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison] -// CHECK-FIXES: if (q20::cmp_less(uArray[2],sArray[2]))) -// FIXME: There should only be 2 closing braces. The fix-it inserts an unbalanced one. +// CHECK-FIXES: if (q20::cmp_less(uArray[2],sArray[2])) if ((unsigned int)uArray[3] < (int)sArray[3]) return 0; @@ -116,6 +115,11 @@ int AllComparisons() { // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison] // CHECK-FIXES: if (q20::cmp_greater(uArray[6] , VALUE)) + if (unsigned(uArray[7]) >= int(sArray[7])) + return 0; +// CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison] +// CHECK-FIXES: if (q20::cmp_greater_equal(uArray[7],sArray[7])) + FuncParameters(uVar); TemplateFuncParameter(sVar); diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp index 628cee0bb0de7..e7981a6d41883 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp @@ -91,8 +91,7 @@ int AllComparisons() { if (static_cast(uArray[2]) < static_cast(sArray[2])) return 0; // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison] -// CHECK-FIXES: if (std::cmp_less(uArray[2],sArray[2]))) -// FIXME: There should only be 2 closing braces. The fix-it inserts an unbalanced one. +// CHECK-FIXES: if (std::cmp_less(uArray[2],sArray[2])) if ((unsigned int)uArray[3] < (int)sArray[3]) return 0; @@ -115,6 +114,11 @@ int AllComparisons() { // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison] // CHECK-FIXES: if (std::cmp_greater(uArray[6] , VALUE)) + if (unsigned(uArray[7]) >= int(sArray[7])) + return 0; +// CHECK-MESSAGES: :[[@LINE-2]]:9: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison] +// CHECK-FIXES: if (std::cmp_greater_equal(uArray[7],sArray[7])) + FuncParameters(uVar); TemplateFuncParameter(sVar); diff --git a/clang/cmake/modules/CMakeLists.txt b/clang/cmake/modules/CMakeLists.txt index d2d68121371bf..9ad2f984f0e27 100644 --- a/clang/cmake/modules/CMakeLists.txt +++ b/clang/cmake/modules/CMakeLists.txt @@ -8,15 +8,19 @@ include(FindPrefixFromConfig) # the usual CMake convention seems to be ${Project}Targets.cmake. 
set(CLANG_INSTALL_PACKAGE_DIR "${CMAKE_INSTALL_PACKAGEDIR}/clang" CACHE STRING "Path for CMake subdirectory for Clang (defaults to '${CMAKE_INSTALL_PACKAGEDIR}/clang')") -# CMAKE_INSTALL_PACKAGEDIR might be absolute, so don't reuse below. -set(clang_cmake_builddir "${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/clang") # Keep this in sync with llvm/cmake/CMakeLists.txt! set(LLVM_INSTALL_PACKAGE_DIR "${CMAKE_INSTALL_PACKAGEDIR}/llvm" CACHE STRING "Path for CMake subdirectory for LLVM (defaults to '${CMAKE_INSTALL_PACKAGEDIR}/llvm')") # CMAKE_INSTALL_PACKAGEDIR might be absolute, so don't reuse below. -string(REPLACE "${CMAKE_CFG_INTDIR}" "." llvm_cmake_builddir "${LLVM_LIBRARY_DIR}") -set(llvm_cmake_builddir "${llvm_cmake_builddir}/cmake/llvm") +string(REPLACE "${CMAKE_CFG_INTDIR}" "." llvm_builddir "${LLVM_LIBRARY_DIR}") +set(llvm_cmake_builddir "${llvm_builddir}/cmake/llvm") +if(CLANG_BUILT_STANDALONE) + # CMAKE_INSTALL_PACKAGEDIR might be absolute, so don't reuse below. + set(clang_cmake_builddir "${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/clang") +else() + set(clang_cmake_builddir "${llvm_builddir}/cmake/clang") +endif() get_property(CLANG_EXPORTS GLOBAL PROPERTY CLANG_EXPORTS) export(TARGETS ${CLANG_EXPORTS} FILE ${clang_cmake_builddir}/ClangTargets.cmake) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index b746df5dab264..570cab262c115 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -245,7 +245,7 @@ the configuration (without a prefix: ``Auto``). .. note:: This currently only applies to braced initializer lists (when - ``Cpp11BracedListStyle`` is ``true``) and parentheses. + ``Cpp11BracedListStyle`` is not ``Block``) and parentheses. @@ -3816,29 +3816,72 @@ the configuration (without a prefix: ``Auto``). .. _Cpp11BracedListStyle: -**Cpp11BracedListStyle** (``Boolean``) :versionbadge:`clang-format 3.4` :ref:`¶ ` - If ``true``, format braced lists as best suited for C++11 braced - lists. +**Cpp11BracedListStyle** (``BracedListStyle``) :versionbadge:`clang-format 3.4` :ref:`¶ ` + The style to handle braced lists. - Important differences: + Possible values: - * No spaces inside the braced list. - * No line break before the closing brace. - * Indentation with the continuation indent, not with the block indent. + * ``BLS_Block`` (in configuration: ``Block``) + Best suited for pre C++11 braced lists. - Fundamentally, C++11 braced lists are formatted exactly like function - calls would be formatted in their place. If the braced list follows a name - (e.g. a type or variable name), clang-format formats as if the ``{}`` were - the parentheses of a function call with that name. If there is no name, - a zero-length name is assumed. + * Spaces inside the braced list. + * Line break before the closing brace. + * Indentation with the block indent. + + + .. code-block:: c++ + + vector x{ 1, 2, 3, 4 }; + vector x{ {}, {}, {}, {} }; + f(MyMap[{ composite, key }]); + new int[3]{ 1, 2, 3 }; + Type name{ // Comment + value + }; + + * ``BLS_FunctionCall`` (in configuration: ``FunctionCall``) + Best suited for C++11 braced lists. + + * No spaces inside the braced list. + * No line break before the closing brace. + * Indentation with the continuation indent. + + Fundamentally, C++11 braced lists are formatted exactly like function + calls would be formatted in their place. If the braced list follows a + name (e.g. 
a type or variable name), clang-format formats as if the + ``{}`` were the parentheses of a function call with that name. If there + is no name, a zero-length name is assumed. + + .. code-block:: c++ + + vector x{1, 2, 3, 4}; + vector x{{}, {}, {}, {}}; + f(MyMap[{composite, key}]); + new int[3]{1, 2, 3}; + Type name{ // Comment + value}; + + * ``BLS_AlignFirstComment`` (in configuration: ``AlignFirstComment``) + Same as ``FunctionCall``, except for the handling of a comment at the + beginning: everything that follows it is aligned with the comment. + + * No spaces inside the braced list. (Even for a comment at the first + position.) + * No line break before the closing brace. + * Indentation with the continuation indent, except when followed by a + line comment, then it uses the block indent. + + + .. code-block:: c++ + + vector x{1, 2, 3, 4}; + vector x{{}, {}, {}, {}}; + f(MyMap[{composite, key}]); + new int[3]{1, 2, 3}; + Type name{// Comment + value}; - .. code-block:: c++ - true: false: - vector x{1, 2, 3, 4}; vs. vector x{ 1, 2, 3, 4 }; - vector x{{}, {}, {}, {}}; vector x{ {}, {}, {}, {} }; - f(MyMap[{composite, key}]); f(MyMap[{ composite, key }]); - new int[3]{1, 2, 3}; new int[3]{ 1, 2, 3 }; .. _DeriveLineEnding: @@ -6625,7 +6668,7 @@ the configuration (without a prefix: ``Auto``). .. note:: This option doesn't apply to initializer braces if - ``Cpp11BracedListStyle`` is set to ``true``. + ``Cpp11BracedListStyle`` is not ``Block``. Possible values: diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst index b4a671e3cfa3c..ec2af2a6f569d 100644 --- a/clang/docs/HIPSupport.rst +++ b/clang/docs/HIPSupport.rst @@ -164,6 +164,8 @@ Predefined Macros - Represents wavefront memory scope in HIP (value is 2). * - ``__HIP_MEMORY_SCOPE_WORKGROUP`` - Represents workgroup memory scope in HIP (value is 3). + * - ``__HIP_MEMORY_SCOPE_CLUSTER`` + - Represents cluster memory scope in HIP (value is 6). * - ``__HIP_MEMORY_SCOPE_AGENT`` - Represents agent memory scope in HIP (value is 4). * - ``__HIP_MEMORY_SCOPE_SYSTEM`` diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst index c677ddfa5ecc1..eff46ab46e1ca 100644 --- a/clang/docs/InternalsManual.rst +++ b/clang/docs/InternalsManual.rst @@ -10,7 +10,7 @@ Introduction This document describes some of the more important APIs and internal design decisions made in the Clang C front-end. The purpose of this document is to -both capture some of this high level information and also describe some of the +both capture some of this high-level information and also describe some of the design decisions behind it. This is meant for people interested in hacking on Clang, not for end-users. The description below is categorized by libraries, and does not describe any of the clients of the libraries. @@ -20,7 +20,7 @@ LLVM Support Library The LLVM ``libSupport`` library provides many underlying libraries and `data-structures `_, including -command line option processing, various containers and a system abstraction +command line option processing, various containers, and a system abstraction layer, which is used for file system access. The Clang "Basic" Library @@ -34,7 +34,7 @@ and information about the subset of the language being compiled for. Part of this infrastructure is specific to C (such as the ``TargetInfo`` class), other parts could be reused for other non-C-based languages (``SourceLocation``, ``SourceManager``, ``Diagnostics``, ``FileManager``).
-When and if there is future demand we can figure out if it makes sense to +When and if there is future demand, we can figure out if it makes sense to introduce a new library, move the general classes somewhere else, or introduce some other solution. @@ -96,7 +96,7 @@ The ``EXTENSION`` and ``EXTWARN`` severities are used for extensions to the language that Clang accepts. This means that Clang fully understands and can represent them in the AST, but we produce diagnostics to tell the user their code is non-portable. The difference is that the former are ignored by -default, and the later warn by default. The ``WARNING`` severity is used for +default, and the latter warn by default. The ``WARNING`` severity is used for constructs that are valid in the currently selected source language but that are dubious in some way. The ``REMARK`` severity provides generic information about the compilation that is not necessarily related to any dubious code. The @@ -106,7 +106,7 @@ These *severities* are mapped into a smaller set (the ``Diagnostic::Level`` enum, {``Ignored``, ``Note``, ``Remark``, ``Warning``, ``Error``, ``Fatal``}) of output *levels* by the diagnostics subsystem based on various configuration options. -Clang internally supports a fully fine grained mapping mechanism that allows +Clang internally supports a fully fine-grained mapping mechanism that allows you to map almost any diagnostic to the output level that you want. The only diagnostics that cannot be mapped are ``NOTE``\ s, which always follow the severity of the previously emitted diagnostic and ``ERROR``\ s, which can only @@ -116,18 +116,18 @@ example). Diagnostic mappings are used in many ways. For example, if the user specifies ``-pedantic``, ``EXTENSION`` maps to ``Warning``, if they specify ``-pedantic-errors``, it turns into ``Error``. This is used to implement -options like ``-Wunused_macros``, ``-Wundef`` etc. +options like ``-Wunused_macros``, ``-Wundef``, etc. Mapping to ``Fatal`` should only be used for diagnostics that are considered so severe that error recovery won't be able to recover sensibly from them (thus -spewing a ton of bogus errors). One example of this class of error are failure +spewing a ton of bogus errors). One example of this class of error is failure to ``#include`` a file. Diagnostic Wording ^^^^^^^^^^^^^^^^^^ The wording used for a diagnostic is critical because it is the only way for a user to know how to correct their code. Use the following suggestions when -wording a diagnostic. +wording a diagnostic: * Diagnostics in Clang do not start with a capital letter and do not end with punctuation. @@ -162,7 +162,7 @@ wording a diagnostic. cannot be null in well-defined C++ code``. * Prefer diagnostic wording without contractions whenever possible. The single quote in a contraction can be visually distracting due to its use with - syntactic constructs and contractions can be harder to understand for non- + syntactic constructs, and contractions can be harder to understand for non- native English speakers. The Format String @@ -195,14 +195,14 @@ the C++ code that :ref:`produces them `, and are referenced by ``%0`` .. ``%9``. If you have more than 10 arguments to your diagnostic, you are doing something wrong :). Unlike ``printf``, there is no requirement that arguments to the diagnostic end up in the output in the same -order as they are specified, you could have a format string with "``%1 %0``" +order as they are specified; you could have a format string with "``%1 %0``" that swaps them, for example. 
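As a concrete sketch of the reordering described above (the diagnostic ID and the streamed variables are hypothetical; ``Diag`` and ``<<`` are the real interfaces described later in this document):

.. code-block:: c++

   // Arguments always stream in the order %0, %1, ...; a format string such
   // as "cannot initialize %1 with %0" controls where each one is printed.
   S.Diag(Loc, diag::err_cannot_initialize) << SrcType << DestType;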
The text in between the percent and digit are formatting instructions. If there are no instructions, the argument is just turned into a string and substituted in. Here are some "best practices" for writing the English format string: -* Keep the string short. It should ideally fit in the 80 column limit of the +* Keep the string short. It should ideally fit in the 80-column limit of the ``DiagnosticKinds.td`` file. This avoids the diagnostic wrapping when printed, and forces you to think about the important point you are conveying with the diagnostic. @@ -227,7 +227,7 @@ used to achieve this sort of thing in a localizable way, see below. Formatting a Diagnostic Argument ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Arguments to diagnostics are fully typed internally, and come from a couple +Arguments to diagnostics are fully typed internally and come from a couple of different classes: integers, types, names, and random strings. Depending on the class of the argument, it can be optionally formatted in different ways. This gives the ``DiagnosticConsumer`` information about what the argument means @@ -268,7 +268,7 @@ Description: This format specifier is used to merge multiple related diagnostics together into one common one, without requiring the difference to be specified as an English string argument. Instead of specifying the string, the diagnostic - gets an integer argument and the format string selects the numbered option. + gets an integer argument, and the format string selects the numbered option. In this case, the "``%0``" value must be an integer in the range [0..2]. If it is 0, it prints "unary", if it is 1 it prints "binary" if it is 2, it prints "unary or binary". This allows other language translations to @@ -287,7 +287,7 @@ Description: additionally generates a namespace, enumeration, and enumerator list based on the format string given. In the above case, a namespace is generated named ``FrobbleKind`` that has an unscoped enumeration with the enumerators - ``VarDecl`` and ``FuncDecl`` which correspond to the values 0 and 1. This + ``VarDecl`` and ``FuncDecl``, which correspond to the values 0 and 1. This permits a clearer use of the ``Diag`` in source code, as the above could be called as: ``Diag(Loc, diag::frobble) << diag::FrobbleKind::VarDecl``. @@ -407,7 +407,7 @@ Example: def note_ovl_candidate : Note< "candidate %sub{select_ovl_candidate}3,2,1 not viable">; - and will act as if it was written + and will act as if it were written ``"candidate %select{function|constructor}3%select{| template| %1}2 not viable"``. Description: This format specifier is used to avoid repeating strings verbatim in multiple @@ -447,7 +447,7 @@ For example, the binary expression error comes from code like this: << lex->getType() << rex->getType() << lex->getSourceRange() << rex->getSourceRange(); -This shows that use of the ``Diag`` method: it takes a location (a +This shows the use of the ``Diag`` method: it takes a location (a :ref:`SourceLocation ` object) and a diagnostic enum value (which matches the name from ``Diagnostic*Kinds.td``). If the diagnostic takes arguments, they are specified with the ``<<`` operator: the first argument @@ -586,7 +586,7 @@ Strangely enough, the ``SourceLocation`` class represents a location within the source code of the program. Important design points include: #. ``sizeof(SourceLocation)`` must be extremely small, as these are embedded - into many AST nodes and are passed around often. Currently it is 32 bits. + into many AST nodes and are passed around often. 
Currently, it is 32 bits. #. ``SourceLocation`` must be a simple value object that can be efficiently copied. #. We should be able to represent a source location for any byte of any input @@ -605,7 +605,7 @@ In practice, the ``SourceLocation`` works together with the ``SourceManager`` class to encode two pieces of information about a location: its spelling location and its expansion location. For most tokens, these will be the same. However, for a macro expansion (or tokens that came from a ``_Pragma`` -directive) these will describe the location of the characters corresponding to +directive), these will describe the location of the characters corresponding to the token and the location where the token was used (i.e., the macro expansion point or the location of the ``_Pragma`` itself). @@ -621,7 +621,7 @@ token. This concept maps directly to the "spelling location" for the token. .. mostly taken from https://discourse.llvm.org/t/code-ranges-of-tokens-ast-elements/16893/2 Clang represents most source ranges by [first, last], where "first" and "last" -each point to the beginning of their respective tokens. For example consider +each point to the beginning of their respective tokens. For example, consider the ``SourceRange`` of the following statement: .. code-block:: text @@ -632,7 +632,7 @@ the ``SourceRange`` of the following statement: To map from this representation to a character-based representation, the "last" location needs to be adjusted to point to (or past) the end of that token with either ``Lexer::MeasureTokenLength()`` or ``Lexer::getLocForEndOfToken()``. For -the rare cases where character-level source ranges information is needed we use +the rare cases where character-level source ranges information is needed, we use the ``CharSourceRange`` class. The Driver Library @@ -651,17 +651,17 @@ The Frontend Library ==================== The Frontend library contains functionality useful for building tools on top of -the Clang libraries, for example several methods for outputting diagnostics. +the Clang libraries, including several methods for outputting diagnostics. Compiler Invocation ------------------- One of the classes provided by the Frontend library is ``CompilerInvocation``, -which holds information that describe current invocation of the Clang ``-cc1`` +which holds information that describes the current invocation of the Clang ``-cc1`` frontend. The information typically comes from the command line constructed by the Clang driver or from clients performing custom initialization. The data structure is split into logical units used by different parts of the compiler, -for example ``PreprocessorOptions``, ``LanguageOptions`` or ``CodeGenOptions``. +for example, ``PreprocessorOptions``, ``LanguageOptions``, or ``CodeGenOptions``. Command Line Interface ---------------------- @@ -698,7 +698,7 @@ Adding new Command Line Option ------------------------------ When adding a new command line option, the first place of interest is the header -file declaring the corresponding options class (e.g. ``CodeGenOptions.h`` for +file declaring the corresponding options class (e.g., ``CodeGenOptions.h`` for command line option that affects the code generation). Create new member variable for the option value: @@ -739,7 +739,7 @@ The helper classes take a list of acceptable prefixes of the option (e.g. Then, specify additional attributes via mix-ins: * ``HelpText`` holds the text that will be printed besides the option name when - the user requests help (e.g. via ``clang --help``). 
+ the user requests help (e.g., via ``clang --help``). * ``Group`` specifies the "category" of options this option belongs to. This is used by various tools to categorize and sometimes filter options. * ``Flags`` may contain "tags" associated with the option. These may affect how @@ -779,7 +779,7 @@ use them to construct the ``-cc1`` job: } The last step is implementing the ``-cc1`` command line argument -parsing/generation that initializes/serializes the option class (in our case +parsing/generation that initializes/serializes the option class (in our case, ``CodeGenOptions``) stored within ``CompilerInvocation``. This can be done automatically by using the marshalling annotations on the option definition: @@ -946,13 +946,13 @@ described below. All of them take a key path argument and possibly other information required for parsing or generating the command line argument. **Note:** The marshalling infrastructure is not intended for driver-only -options. Only options of the ``-cc1`` frontend need to be marshalled to/from +options. Only options of the ``-cc1`` frontend need to be marshalled to/from a ``CompilerInvocation`` instance. **Positive Flag** The key path defaults to ``false`` and is set to ``true`` when the flag is -present on command line. +present on the command line. .. code-block:: text @@ -963,7 +963,7 @@ present on command line. **Negative Flag** The key path defaults to ``true`` and is set to ``false`` when the flag is -present on command line. +present on the command line. .. code-block:: text @@ -1041,7 +1041,7 @@ and the result is assigned to the key path on success. The key path defaults to the value specified in ``MarshallingInfoEnum`` prefixed by the contents of ``NormalizedValuesScope`` and ``::``. This ensures correct -reference to an enum case is formed even if the enum resides in different +reference to an enum case is formed even if the enum resides in a different namespace or is an enum class. If the value present on the command line does not match any of the comma-separated values from ``Values``, an error diagnostic is issued. Otherwise, the corresponding element from ``NormalizedValues`` at the @@ -1410,7 +1410,7 @@ or a clear engineering tradeoff -- should desugar minimally and wrap the result in a construct representing the original source form. For example, ``CXXForRangeStmt`` directly represents the syntactic form of a -range-based for statement, but also holds a semantic representation of the +range-based for statement but also holds a semantic representation of the range declaration and iterator declarations. It does not contain a fully-desugared ``ForStmt``, however. @@ -1425,7 +1425,7 @@ with the same or similar semantics. The ``Type`` class and its subclasses ------------------------------------- -The ``Type`` class (and its subclasses) are an important part of the AST. +The ``Type`` class (and its subclasses) is an important part of the AST. Types are accessed through the ``ASTContext`` class, which implicitly creates and uniques them as they are needed. Types have a couple of non-obvious features: 1) they do not capture type qualifiers like ``const`` or ``volatile`` @@ -1474,7 +1474,7 @@ various operators (for example, the type of ``*Y`` is "``foo``", not is an instance of the ``TypedefType`` class, which indicates that the type of these expressions is a typedef for "``foo``". 
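A short sketch of how this sugar shows up through the API (the ``inspect`` function is illustrative; ``getAs``, ``getDecl``, and ``getCanonicalType`` are the accessors discussed in this section):

.. code-block:: c++

   // For the "*Y" example above: QT is spelled "foo", a typedef...
   void inspect(clang::QualType QT) {
     if (const auto *TDT = QT->getAs<clang::TypedefType>())
       llvm::outs() << TDT->getDecl()->getName() << "\n"; // prints "foo"
     // ...while the canonical type strips the sugar down to "int".
     llvm::outs() << QT.getCanonicalType().getAsString() << "\n";
   }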
-Representing types like this is great for diagnostics, because the +Representing types like this is great for diagnostics because the user-specified type is always immediately available. There are two problems with this: first, various semantic checks need to make judgements about the *actual structure* of a type, ignoring typedefs. Second, we need an efficient @@ -1521,7 +1521,7 @@ know it exists. To continue the example, the result type of the indirection operator is the pointee type of the subexpression. In order to determine the type, we need to get the instance of ``PointerType`` that best captures the typedef information in the program. If the type of the expression is literally -a ``PointerType``, we can return that, otherwise we have to dig through the +a ``PointerType``, we can return that; otherwise, we have to dig through the typedefs to find the pointer type. For example, if the subexpression had type "``foo*``", we could return that type as the result. If the subexpression had type "``bar``", we want to return "``foo*``" (note that we do *not* want @@ -1552,7 +1552,7 @@ that sets a bit), and remove one or more type qualifiers (just return a ``QualType`` with the bitfield set to empty). Further, because the bits are stored outside of the type itself, we do not need -to create duplicates of types with different sets of qualifiers (i.e. there is +to create duplicates of types with different sets of qualifiers (i.e., there is only a single heap allocated "``int``" type: "``const int``" and "``volatile const int``" both point to the same heap allocated "``int``" type). This reduces the heap size used to represent bits and also means we do not have to @@ -1972,7 +1972,7 @@ and optimize code for it, but it's used as parsing continues to detect further errors in the input. Clang-based tools also depend on such ASTs, and IDEs in particular benefit from a high-quality AST for broken code. -In presence of errors, clang uses a few error-recovery strategies to present the +In the presence of errors, clang uses a few error-recovery strategies to present the broken code in the AST: - correcting errors: in cases where clang is confident about the fix, it @@ -1981,7 +1981,7 @@ broken code in the AST: provide more accurate subsequent diagnostics. Typo correction is a typical example. - representing invalid node: the invalid node is preserved in the AST in some - form, e.g. when the "declaration" part of the declaration contains semantic + form, e.g., when the "declaration" part of the declaration contains semantic errors, the Decl node is marked as invalid. - dropping invalid node: this often happens for errors that we don’t have graceful recovery. Prior to Recovery AST, a mismatched-argument function call @@ -1994,9 +1994,9 @@ for broken code. Recovery AST ^^^^^^^^^^^^ -The idea of Recovery AST is to use recovery nodes which act as a placeholder to +The idea of Recovery AST is to use recovery nodes, which act as a placeholder to maintain the rough structure of the parsing tree, preserve locations and -children but have no language semantics attached to them. +children, but have no language semantics attached to them. For example, consider the following mismatched function call: @@ -2031,10 +2031,10 @@ With Recovery AST, the AST looks like: `-DeclRefExpr 'int' lvalue ParmVar 'abc' 'int' -An alternative is to use existing Exprs, e.g. CallExpr for the above example. -This would capture more call details (e.g. 
locations of parentheses) and allow +An alternative is to use existing Exprs, e.g., CallExpr for the above example. +This would capture more call details (e.g., locations of parentheses) and allow it to be treated uniformly with valid CallExprs. However, jamming the data we -have into CallExpr forces us to weaken its invariants, e.g. arg count may be +have into CallExpr forces us to weaken its invariants, e.g., arg count may be wrong. This would introduce a huge burden on consumers of the AST to handle such "impossible" cases. So when we're representing (rather than correcting) errors, we use a distinct recovery node type with extremely weak invariants instead. @@ -2048,7 +2048,7 @@ Types and dependence ^^^^^^^^^^^^^^^^^^^^ ``RecoveryExpr`` is an ``Expr``, so it must have a type. In many cases the true -type can't really be known until the code is corrected (e.g. a call to a +type can't really be known until the code is corrected (e.g., a call to a function that doesn't exist). And it means that we can't properly perform type checks on some containing constructs, such as ``return 42 + unknownFunction()``. @@ -2058,7 +2058,7 @@ mean dependence on a template parameter or how an error is repaired. The ``DependentTy``, and this suppresses type-based analysis in the same way it would inside a template. -In cases where we are confident about the concrete type (e.g. the return type +In cases where we are confident about the concrete type (e.g., the return type for a broken non-overloaded function call), the ``RecoveryExpr`` will have this type. This allows more code to be typechecked, and produces a better AST and more diagnostics. For example: @@ -2071,7 +2071,7 @@ more diagnostics. For example: Whether or not the ``RecoveryExpr`` has a dependent type, it is always considered value-dependent, because its value isn't well-defined until the error is resolved. Among other things, this means that clang doesn't emit more errors -where a RecoveryExpr is used as a constant (e.g. array size), but also won't try +where a RecoveryExpr is used as a constant (e.g., array size), but also won't try to evaluate it. ContainsErrors bit @@ -2122,7 +2122,7 @@ cycles. One example of a cycle is the connection between a ``ClassTemplateDecl`` and its "templated" ``CXXRecordDecl``. The *templated* ``CXXRecordDecl`` represents all the fields and methods inside the class template, while the ``ClassTemplateDecl`` holds the information which is -related to being a template, i.e. template arguments, etc. We can get the +related to being a template, i.e., template arguments, etc. We can get the *templated* class (the ``CXXRecordDecl``) of a ``ClassTemplateDecl`` with ``ClassTemplateDecl::getTemplatedDecl()``. And we can get back a pointer of the "described" class template from the *templated* class: @@ -2145,7 +2145,7 @@ we skip the copy. The informal definition of structural equivalency is the following: Two nodes are **structurally equivalent** if they are -- builtin types and refer to the same type, e.g. ``int`` and ``int`` are +- builtin types and refer to the same type, e.g., ``int`` and ``int`` are structurally equivalent, - function types and all their parameters have structurally equivalent types, - record types and all their fields in order of their definition have the same @@ -2162,7 +2162,7 @@ mentioned properties, we have to check for equivalent template parameters/arguments, etc. The structural equivalent check can be and is used independently from the -ASTImporter, e.g. the ``clang::Sema`` class uses it also. 
+ASTImporter, e.g., the ``clang::Sema`` class uses it also. The equivalence of nodes may depend on the equivalency of other pairs of nodes. Thus, the check is implemented as a parallel graph traversal. We traverse @@ -2195,7 +2195,7 @@ Redeclaration Chains ^^^^^^^^^^^^^^^^^^^^ The early version of the ``ASTImporter``'s merge mechanism squashed the -declarations, i.e. it aimed to have only one declaration instead of maintaining +declarations, i.e., it aimed to have only one declaration instead of maintaining a whole redeclaration chain. This early approach simply skipped importing a function prototype, but it imported a definition. To demonstrate the problem with this approach let's consider an empty "to" context and the following @@ -2225,7 +2225,7 @@ another definition, we will use the existing definition. However, we can import prototype(s): we chain the newly imported prototype(s) to the existing definition. Whenever we import a new prototype from a third context, that will be added to the end of the redeclaration chain. This may result in long -redeclaration chains in certain cases, e.g. if we import from several +redeclaration chains in certain cases, e.g., if we import from several translation units which include the same header with the prototype. .. Squashing prototypes @@ -2290,7 +2290,7 @@ Traversal during the Import ^^^^^^^^^^^^^^^^^^^^^^^^^^^ The node specific import mechanisms are implemented in -``ASTNodeImporter::VisitNode()`` functions, e.g. ``VisitFunctionDecl()``. +``ASTNodeImporter::VisitNode()`` functions, e.g., ``VisitFunctionDecl()``. When we import a declaration then first we import everything which is needed to call the constructor of that declaration node. Everything which can be set later is set after the node is created. For example, in case of a @@ -2490,7 +2490,7 @@ In case of LLDB, an implementation of the ``ExternalASTSource`` interface is attached to the AST context which is related to the parsed expression. This implementation of the ``ExternalASTSource`` interface is realized with the help of the ``ASTImporter`` class. This way, LLDB can reuse Clang's parsing -machinery while synthesizing the underlying AST from the debug data (e.g. from +machinery while synthesizing the underlying AST from the debug data (e.g., from DWARF). From the view of the ``ASTImporter`` this means both the "to" and the "from" context may have declaration contexts with external lexical storage. If a ``DeclContext`` in the "to" AST context has external lexical storage then we @@ -2573,7 +2573,7 @@ conflict error (ODR violation in C++). In this case, we return with an clients of the ``ASTImporter`` may require a different, perhaps less conservative and more liberal error handling strategy. -E.g. static analysis clients may benefit if the node is created even if there +E.g., static analysis clients may benefit if the node is created even if there is a name conflict. During the CTU analysis of certain projects, we recognized that there are global declarations which collide with declarations from other translation units, but they are not referenced outside from their translation @@ -2916,7 +2916,7 @@ Any error during satisfaction is recorded in ``ConstraintSatisfaction``. for nested requirements, ``ConstraintSatisfaction`` is stored (including diagnostics) in the AST, which is something we might want to improve. 
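Returning to the ``ASTImporter`` interface described above, a minimal import sketch (the two contexts, file managers, and ``FromDecl`` are placeholders; ``Import`` returns ``llvm::Expected``):

.. code-block:: c++

   // Import a declaration from the "from" AST into the "to" AST; with
   // MinimalImport=false the declaration is imported eagerly and fully.
   clang::ASTImporter Importer(ToCtx, ToFileMgr, FromCtx, FromFileMgr,
                               /*MinimalImport=*/false);
   llvm::Expected<clang::Decl *> ImportedOrErr = Importer.Import(FromDecl);
   if (!ImportedOrErr)
     llvm::consumeError(ImportedOrErr.takeError());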
-When an atomic constraint is not satified, we try to substitute into any +When an atomic constraint is not satisfied, we try to substitute into any enclosing concept-id using the same mechanism described above, for diagnostics purpose, and inject that in the ``ConstraintSatisfaction``. @@ -3584,7 +3584,7 @@ be specified by appending a ``+`` to the number. For example: void f(); // expected-note 0+ {{previous declaration is here}} void g(); // expected-note 1+ {{previous declaration is here}} -In the first example, the diagnostic becomes optional, i.e. it will be +In the first example, the diagnostic becomes optional, i.e., it will be swallowed if it occurs, but will not generate an error if it does not occur. In the second example, the diagnostic must occur at least once. As a short-hand, "one or more" can be specified simply by ``+``. For example: diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index d7253b79d8b7d..29b4288a18c35 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -4843,6 +4843,7 @@ currently supported: * ``__MEMORY_SCOPE_SYSTEM`` * ``__MEMORY_SCOPE_DEVICE`` * ``__MEMORY_SCOPE_WRKGRP`` +* ``__MEMORY_SCOPE_CLUSTR`` * ``__MEMORY_SCOPE_WVFRNT`` * ``__MEMORY_SCOPE_SINGLE`` @@ -5324,6 +5325,53 @@ returns the bit at the position of the current lane. It is almost equivalent to ``(mask & (1 << lane_id)) != 0``, except that its behavior is only defined if the given mask has the same value for all active lanes of the current wave. + +__builtin_amdgcn_global_load_b128 and __builtin_amdgcn_global_store_b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Signature: + +.. code-block:: c + + typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u; + typedef v4u __attribute__((address_space(1))) *global_ptr_to_v4u; + + v4u __builtin_amdgcn_global_load_b128( + v4u __attribute__((address_space(1))) *src, + const char *scope); + + void __builtin_amdgcn_global_store_b128( + v4u __attribute__((address_space(1))) *dst, + v4u data, + const char *scope); + +Load or store a vector of 4 unsigned integers from or to global memory with +cache behavior specified by `scope` which must be a string literal. + +Valid values for `scope` are: + +===================== ========================================================== +scope architecture name +===================== ========================================================== +``"wavefront"`` wave + +``"workgroup"`` group + +``"agent"`` device + +``""`` (empty string) system +===================== ========================================================== + +These builtins are only supported on gfx942 and gfx950 devices. + +For semantics on gfx942, see Tables 47 and 48 in section 9.1.10 "Memory Scope +and Temporal Controls" of the "AMD Instinct MI300" Instruction Set Architecture +Reference. + +For semantics on gfx950, see Tables 49 and 50 in section 9.1.10 "Memory Scope +and Temporal Controls" of the CDNA4 Instruction Set Architecture Reference. + + ARM/AArch64 Language Extensions ------------------------------- diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 5c73e2486030e..c75c1703a8dc3 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -193,7 +193,7 @@ implementation. 
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | support non-contiguous array sections for target update | :good:`done` | https://github.com/llvm/llvm-project/pull/144635 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | pointer attachment | :good:`done` | | +| device | pointer attachment | :part:`being repaired` | @abhinavgaba (https://github.com/llvm/llvm-project/pull/153683) | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | atomic | hints for the atomic construct | :good:`done` | D51233 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ @@ -627,6 +627,10 @@ implementation. +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | loop grid/tile modifiers for sizes clause | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| attach map-type modifier | :part:`In Progress` | :none:`unclaimed` | C/C++: @abhinavgaba; | +| | | | RT: @abhinavgaba (https://github.com/llvm/llvm-project/pull/149036, | +| | | | https://github.com/llvm/llvm-project/pull/158370) | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ OpenMP Extensions diff --git a/clang/docs/PointerAuthentication.rst b/clang/docs/PointerAuthentication.rst index 96eb498bc48b6..7e65f4b1b4915 100644 --- a/clang/docs/PointerAuthentication.rst +++ b/clang/docs/PointerAuthentication.rst @@ -592,6 +592,36 @@ The result value is never zero and always within range for both the This can be used in constant expressions. +``ptrauth_type_discriminator`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: c + + ptrauth_type_discriminator(type) + +Compute the constant discriminator derived from the given type, as is computed +for automatically type-diversified schemas. + +``type`` must be a type. The result has the type ``ptrauth_extra_data_t``. + +This can be used in constant expressions. + +``ptrauth_function_pointer_type_discriminator`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: c + + ptrauth_function_pointer_type_discriminator(function_type) + +Compute the constant discriminator derived from the provided function type, for +use in contexts where the default function authentication schema applies. If function +pointer type diversity is enabled, this is equivalent to +``ptrauth_type_discriminator(function_type)``; if it is not enabled, this is ``0``. + +``function_type`` must be a function type. The result has the type ``ptrauth_extra_data_t``. + +This can be used in constant expressions.
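A hedged usage sketch for the two discriminator operations documented above (the function type is illustrative; compiling requires a pointer-authentication-enabled target and ``<ptrauth.h>``, which provides the macros):

.. code-block:: c++

   #include <ptrauth.h>

   // Both calls are constant expressions of type ptrauth_extra_data_t,
   // per the documentation above.
   const ptrauth_extra_data_t TypeDisc =
       ptrauth_type_discriminator(void (*)(int));
   const ptrauth_extra_data_t FnDisc =
       ptrauth_function_pointer_type_discriminator(void(int));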
+ ``ptrauth_strip`` ^^^^^^^^^^^^^^^^^ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8f9a30d7ba97d..caa2802994ed0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -128,6 +128,17 @@ AST Dumping Potentially Breaking Changes - Default arguments of template template parameters are pretty-printed now. +- ``asm`` attributes are now always pretty-printed as the first attribute + on the right side of the declaration. Before we had, e.g.: + + ``__attribute__(("visibility")) asm("string")`` + + Now we have: + + ``asm("string") __attribute__(("visibility"))`` + + This form is accepted by both the clang and gcc parsers. + Clang Frontend Potentially Breaking Changes ------------------------------------------- - Members of anonymous unions/structs are now injected as ``IndirectFieldDecl`` @@ -191,6 +202,8 @@ C23 Feature Support - Added ``FLT_SNAN``, ``DBL_SNAN``, and ``LDBL_SNAN`` to Clang's ```` header in C23 and later modes. This implements `WG14 N2710 `_. +- Fixed incorrectly accepting as compatible unnamed tag types that have the same fields within + the same translation unit but come from different types. Non-comprehensive list of changes in this release ------------------------------------------------- @@ -269,12 +282,14 @@ Non-comprehensive list of changes in this release allocation functions with a token ID can be enabled via the ``-fsanitize=alloc-token`` flag. +- Clang now rejects the invalid use of ``constexpr`` with ``auto`` and an explicit type in C. (#GH163090) + New Compiler Flags ------------------ - New option ``-fno-sanitize-debug-trap-reasons`` added to disable emitting trap reasons into the debug info when compiling with trapping UBSan (e.g. ``-fsanitize-trap=undefined``). - New option ``-fsanitize-debug-trap-reasons=`` added to control emitting trap reasons into the debug info when compiling with trapping UBSan (e.g. ``-fsanitize-trap=undefined``). - New options for enabling allocation token instrumentation: ``-fsanitize=alloc-token``, ``-falloc-token-max=``, ``-fsanitize-alloc-token-fast-abi``, ``-fsanitize-alloc-token-extended``. - +- The ``-resource-dir`` option is now displayed in the list of options shown by ``--help``. Lanai Support ^^^^^^^^^^^^^^ @@ -550,6 +565,8 @@ Improvements to Clang's diagnostics pointer, provided it can be proven that the pointer only points to ``[[noreturn]]`` functions. +- Diagnostic messages now refer to ``structured binding`` instead of ``decomposition``, + to align with `P0615R0 `_ changing the term. (#GH157880) - Added a separate diagnostic group ``-Wfunction-effect-redeclarations``, for the more pedantic diagnostics for function effects (``[[clang::nonblocking]]`` and ``[[clang::nonallocating]]``). Moved the warning for a missing (though implied) attribute on a redeclaration into this group. @@ -659,6 +676,7 @@ Bug Fixes in This Version (#GH159080) - Fixed a failed assertion with empty filename arguments in ``__has_embed``. (#GH159898) - Fixed a failed assertion with empty filename in ``#embed`` directive. (#GH162951) +- Fixed a crash triggered by unterminated ``__has_embed``. (#GH162953) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -672,7 +690,8 @@ Bug Fixes to Attribute Support (#GH141504) and on types returned from indirect calls (#GH142453). - Fixes some late parsed attributes, when applied to function definitions, not being parsed in function try blocks, and some situations where parsing of the function body - is skipped, such as error recovery and code completion.
(#GH153551) + is skipped, such as error recovery, code completion, and msvc-compatible delayed + template parsing. (#GH153551) - Using ``[[gnu::cleanup(some_func)]]`` where some_func is annotated with ``[[gnu::error("some error")]]`` now correctly triggers an error. (#GH146520) - Fix a crash when the function name is empty in the `swift_name` attribute. (#GH157075) @@ -683,6 +702,7 @@ Bug Fixes to C++ Support - Suppress ``-Wdeprecated-declarations`` in implicitly generated functions. (#GH147293) - Fix a crash when deleting a pointer to an incomplete array (#GH150359). - Fixed a mismatched lambda scope bug when propagating up ``consteval`` within nested lambdas. (#GH145776) +- Disallow immediate escalation in destructors. (#GH109096) - Fix an assertion failure when expression in assumption attribute (``[[assume(expr)]]``) creates temporary objects. - Fix the dynamic_cast to final class optimization to correctly handle @@ -717,6 +737,7 @@ Bug Fixes to C++ Support - Fix a crash when attempting to deduce a deduction guide from a non deducible template template parameter. (#130604) - Fix for clang incorrectly rejecting the default construction of a union with nontrivial member when another member has an initializer. (#GH81774) +- Fixed a template depth issue when parsing lambdas inside a type constraint. (#GH162092) - Diagnose unresolved overload sets in non-dependent compound requirements. (#GH51246) (#GH97753) Bug Fixes to AST Handling @@ -766,6 +787,8 @@ X86 Support driver. - Remove `[no-]evex512` feature request from intrinsics and builtins. - Change features `avx10.x-[256,512]` to `avx10.x`. +- `-march=wildcatlake` is now supported. +- `-march=novalake` is now supported. Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 01f0b27846f0b..e82b16f24c73f 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2325,7 +2325,7 @@ are listed below. devirtualization and virtual constant propagation, for classes with :doc:`hidden LTO visibility `. Requires ``-flto``. -.. option:: -f[no]split-lto-unit +.. option:: -f[no-]split-lto-unit Controls splitting the :doc:`LTO unit ` into regular LTO and :doc:`ThinLTO` portions, when compiling with -flto=thin. Defaults to false @@ -2518,7 +2518,7 @@ are listed below. .. _funique_internal_linkage_names: -.. option:: -f[no]-unique-internal-linkage-names +.. option:: -f[no-]unique-internal-linkage-names Controls whether Clang emits a unique (best-effort) symbol name for internal linkage symbols. When this option is set, compiler hashes the main source @@ -2539,7 +2539,7 @@ are listed below. $ cd $P/bar && clang -c -funique-internal-linkage-names name_conflict.c $ cd $P && clang foo/name_conflict.o && bar/name_conflict.o -.. option:: -f[no]-basic-block-address-map: +.. option:: -f[no-]basic-block-address-map: Emits a ``SHT_LLVM_BB_ADDR_MAP`` section which includes address offsets for each basic block in the program, relative to the parent function address. diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index d942578dd7596..fd0b304cba0df 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -205,6 +205,50 @@ pointers with a specified address space. If the option is set to false, then reports from the specific x86 address spaces 256, 257 and 258 are still suppressed, but null dereferences from other address spaces are reported. +.. 
_core-NullPointerArithm: + +core.NullPointerArithm (C, C++) +""""""""""""""""""""""""""""""" +Check for undefined arithmetic operations with null pointers. + +The checker can detect the following cases: + + - ``p + x`` and ``x + p`` where ``p`` is a null pointer and ``x`` is a nonzero + integer value. + - ``p - x`` where ``p`` is a null pointer and ``x`` is a nonzero integer + value. + - ``p1 - p2`` where one of ``p1`` and ``p2`` is null and the other is a + non-null pointer. + +The result of these operations is undefined according to the standard. +In the cases listed above, the checker warns even if the value +described as "nonzero" or "non-null" is unknown, because it is likely +to be non-zero at some point during program execution. + +.. code-block:: c + + void test1(int *p, int offset) { + if (p) + return; + + int *p1 = p + offset; // warn: 'p' is null, 'offset' is unknown but likely non-zero + } + + void test2(int *p, int offset) { + if (p) { } // this indicates that it is possible for 'p' to be null + if (offset == 0) + return; + + int *p1 = p - offset; // warn: 'p' is null, 'offset' is known to be non-zero + } + + void test3(char *p1, char *p2) { + if (p1) + return; + + int a = p1 - p2; // warn: 'p1' is null, 'p2' is likely non-null + } + .. _core-StackAddressEscape: core.StackAddressEscape (C) @@ -3421,12 +3465,6 @@ Check for an out-of-bound pointer being returned to callers. return x; // warn: undefined or garbage returned } - -alpha.security.cert -^^^^^^^^^^^^^^^^^^^ - -SEI CERT checkers which tries to find errors based on their `C coding rules `_. - alpha.unix ^^^^^^^^^^ diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 78220d4d8ff5b..33aa2d343aa7a 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -2874,11 +2874,11 @@ class ASTContext : public RefCountedBase { /// returned type is guaranteed to be free of any of these, allowing two /// canonical types to be compared for exact equality with a simple pointer /// comparison. - CanQualType getCanonicalType(QualType T) const { + static CanQualType getCanonicalType(QualType T) { return CanQualType::CreateUnsafe(T.getCanonicalType()); } - const Type *getCanonicalType(const Type *T) const { + static const Type *getCanonicalType(const Type *T) { return T->getCanonicalTypeInternal().getTypePtr(); } @@ -2890,10 +2890,10 @@ class ASTContext : public RefCountedBase { CanQualType getCanonicalParamType(QualType T) const; /// Determine whether the given types \p T1 and \p T2 are equivalent. - bool hasSameType(QualType T1, QualType T2) const { + static bool hasSameType(QualType T1, QualType T2) { return getCanonicalType(T1) == getCanonicalType(T2); } - bool hasSameType(const Type *T1, const Type *T2) const { + static bool hasSameType(const Type *T1, const Type *T2) { return getCanonicalType(T1) == getCanonicalType(T2); } @@ -2921,7 +2921,7 @@ class ASTContext : public RefCountedBase { /// Determine whether the given types are equivalent after /// cvr-qualifiers have been removed.
- bool hasSameUnqualifiedType(QualType T1, QualType T2) const { + static bool hasSameUnqualifiedType(QualType T1, QualType T2) { return getCanonicalType(T1).getTypePtr() == getCanonicalType(T2).getTypePtr(); } diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h index 092160405aff4..e74bb72571d64 100644 --- a/clang/include/clang/AST/ASTNodeTraverser.h +++ b/clang/include/clang/AST/ASTNodeTraverser.h @@ -770,7 +770,7 @@ class ASTNodeTraverser // it will not be in the parent context: if (auto *TT = D->getFriendType()->getType()->getAs()) if (TT->isTagOwned()) - Visit(TT->getOriginalDecl()); + Visit(TT->getDecl()); } else { Visit(D->getFriendDecl()); } diff --git a/clang/include/clang/AST/CanonicalType.h b/clang/include/clang/AST/CanonicalType.h index b5a4e94e1330a..87bbd7b5d885d 100644 --- a/clang/include/clang/AST/CanonicalType.h +++ b/clang/include/clang/AST/CanonicalType.h @@ -551,18 +551,18 @@ struct CanProxyAdaptor template<> struct CanProxyAdaptor : public CanProxyBase { - LLVM_CLANG_CANPROXY_SIMPLE_ACCESSOR(TagDecl *, getOriginalDecl) + LLVM_CLANG_CANPROXY_SIMPLE_ACCESSOR(TagDecl *, getDecl) }; template<> struct CanProxyAdaptor : public CanProxyBase { - LLVM_CLANG_CANPROXY_SIMPLE_ACCESSOR(RecordDecl *, getOriginalDecl) + LLVM_CLANG_CANPROXY_SIMPLE_ACCESSOR(RecordDecl *, getDecl) LLVM_CLANG_CANPROXY_SIMPLE_ACCESSOR(bool, hasConstFields) }; template<> struct CanProxyAdaptor : public CanProxyBase { - LLVM_CLANG_CANPROXY_SIMPLE_ACCESSOR(EnumDecl *, getOriginalDecl) + LLVM_CLANG_CANPROXY_SIMPLE_ACCESSOR(EnumDecl *, getDecl) }; template<> diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 898487bffec08..dfa3befb27dd0 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -3832,7 +3832,7 @@ class UsingEnumDecl : public BaseUsingDecl, public Mergeable { public: EnumDecl *getEnumDecl() const { - return EnumType->getType()->castAs()->getOriginalDecl(); + return EnumType->getType()->castAs()->getDecl(); } static UsingEnumDecl *Create(ASTContext &C, DeclContext *DC, diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h index 1e351f31f4b92..83f2b18435633 100644 --- a/clang/include/clang/AST/OpenACCClause.h +++ b/clang/include/clang/AST/OpenACCClause.h @@ -1301,46 +1301,25 @@ struct OpenACCReductionRecipe { // AST), or in a separate collection when being semantically analyzed. llvm::ArrayRef CombinerRecipes; + bool isSet() const { return AllocaDecl; } + +private: + friend class OpenACCReductionClause; OpenACCReductionRecipe(VarDecl *A, llvm::ArrayRef Combiners) : AllocaDecl(A), CombinerRecipes(Combiners) {} - - bool isSet() const { return AllocaDecl; } }; // A version of the above that is used for semantic analysis, at a time before // the OpenACCReductionClause node has been created. This one has storage for // the CombinerRecipe, since Trailing storage for it doesn't exist yet. 
-struct OpenACCReductionRecipeWithStorage : OpenACCReductionRecipe { -private: - llvm::SmallVector CombinerRecipeStorage; - -public: - OpenACCReductionRecipeWithStorage(VarDecl *A, - llvm::ArrayRef Combiners) - : OpenACCReductionRecipe(A, {}), CombinerRecipeStorage(Combiners) { - CombinerRecipes = CombinerRecipeStorage; - } +struct OpenACCReductionRecipeWithStorage { + VarDecl *AllocaDecl; + llvm::SmallVector CombinerRecipes; OpenACCReductionRecipeWithStorage( - const OpenACCReductionRecipeWithStorage &Other) - : OpenACCReductionRecipe(Other), - CombinerRecipeStorage(Other.CombinerRecipeStorage) { - CombinerRecipes = CombinerRecipeStorage; - } - - OpenACCReductionRecipeWithStorage(OpenACCReductionRecipeWithStorage &&Other) - : OpenACCReductionRecipe(std::move(Other)), - CombinerRecipeStorage(std::move(Other.CombinerRecipeStorage)) { - CombinerRecipes = CombinerRecipeStorage; - } - - // There is no real problem implementing these, we just have to make sure the - // array-ref this inherits from stays in sync. But as we don't need it at the - // moment, make sure we don't accidentially call these. - OpenACCReductionRecipeWithStorage & - operator=(OpenACCReductionRecipeWithStorage &&) = delete; - OpenACCReductionRecipeWithStorage & - operator=(const OpenACCReductionRecipeWithStorage &) = delete; + VarDecl *A, + llvm::ArrayRef Combiners) + : AllocaDecl(A), CombinerRecipes(Combiners) {} static OpenACCReductionRecipeWithStorage Empty() { return OpenACCReductionRecipeWithStorage(/*AllocaDecl=*/nullptr, {}); diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index c246c4ab458ba..32b2b6bdb989c 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -1710,7 +1710,7 @@ DEF_TRAVERSE_DECL(FriendDecl, { // it will not be in the parent context: if (auto *TT = D->getFriendType()->getType()->getAs(); TT && TT->isTagOwned()) - TRY_TO(TraverseDecl(TT->getOriginalDecl())); + TRY_TO(TraverseDecl(TT->getDecl())); } else { TRY_TO(TraverseDecl(D->getFriendDecl())); } diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index df106d5b12c8d..7bd2441289619 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -27,7 +27,7 @@ inline CXXRecordDecl *Type::getAsCXXRecordDecl() const { const auto *TT = dyn_cast(CanonicalType); if (!isa_and_present(TT)) return nullptr; - auto *TD = TT->getOriginalDecl(); + auto *TD = TT->getDecl(); if (isa(TT) && !isa(TD)) return nullptr; return cast(TD)->getDefinitionOrSelf(); @@ -35,41 +35,39 @@ inline CXXRecordDecl *Type::getAsCXXRecordDecl() const { inline CXXRecordDecl *Type::castAsCXXRecordDecl() const { const auto *TT = cast(CanonicalType); - return cast(TT->getOriginalDecl())->getDefinitionOrSelf(); + return cast(TT->getDecl())->getDefinitionOrSelf(); } inline RecordDecl *Type::getAsRecordDecl() const { const auto *TT = dyn_cast(CanonicalType); if (!isa_and_present(TT)) return nullptr; - return cast(TT->getOriginalDecl())->getDefinitionOrSelf(); + return cast(TT->getDecl())->getDefinitionOrSelf(); } inline RecordDecl *Type::castAsRecordDecl() const { const auto *TT = cast(CanonicalType); - return cast(TT->getOriginalDecl())->getDefinitionOrSelf(); + return cast(TT->getDecl())->getDefinitionOrSelf(); } inline EnumDecl *Type::getAsEnumDecl() const { if (const auto *TT = dyn_cast(CanonicalType)) - return TT->getOriginalDecl()->getDefinitionOrSelf(); + return TT->getDecl()->getDefinitionOrSelf(); return nullptr; } inline 
EnumDecl *Type::castAsEnumDecl() const { - return cast(CanonicalType) - ->getOriginalDecl() - ->getDefinitionOrSelf(); + return cast(CanonicalType)->getDecl()->getDefinitionOrSelf(); } inline TagDecl *Type::getAsTagDecl() const { if (const auto *TT = dyn_cast(CanonicalType)) - return TT->getOriginalDecl()->getDefinitionOrSelf(); + return TT->getDecl()->getDefinitionOrSelf(); return nullptr; } inline TagDecl *Type::castAsTagDecl() const { - return cast(CanonicalType)->getOriginalDecl()->getDefinitionOrSelf(); + return cast(CanonicalType)->getDecl()->getDefinitionOrSelf(); } inline bool QualType::hasNonTrivialToPrimitiveDefaultInitializeCUnion() const { diff --git a/clang/include/clang/AST/TypeBase.h b/clang/include/clang/AST/TypeBase.h index 625cc77dc1f08..f07861f50fe8c 100644 --- a/clang/include/clang/AST/TypeBase.h +++ b/clang/include/clang/AST/TypeBase.h @@ -4378,8 +4378,6 @@ class ConstantMatrixType final : public MatrixType { unsigned NumRows; unsigned NumColumns; - static constexpr unsigned MaxElementsPerDimension = (1 << 20) - 1; - ConstantMatrixType(QualType MatrixElementType, unsigned NRows, unsigned NColumns, QualType CanonElementType); @@ -4398,16 +4396,6 @@ class ConstantMatrixType final : public MatrixType { return getNumRows() * getNumColumns(); } - /// Returns true if \p NumElements is a valid matrix dimension. - static constexpr bool isDimensionValid(size_t NumElements) { - return NumElements > 0 && NumElements <= MaxElementsPerDimension; - } - - /// Returns the maximum number of elements per dimension. - static constexpr unsigned getMaxElementsPerDimension() { - return MaxElementsPerDimension; - } - void Profile(llvm::FoldingSetNodeID &ID) { Profile(ID, getElementType(), getNumRows(), getNumColumns(), getTypeClass()); @@ -6419,10 +6407,10 @@ class TagType : public TypeWithKeyword { bool IsInjected, const Type *CanonicalType); public: - // FIXME: Temporarily renamed from `getDecl` in order to facilitate - // rebasing, due to change in behaviour. This should be renamed back - // to `getDecl` once the change is settled. - TagDecl *getOriginalDecl() const { return decl; } + TagDecl *getDecl() const { return decl; } + [[deprecated("Use getDecl instead")]] TagDecl *getOriginalDecl() const { + return decl; + } NestedNameSpecifier getQualifier() const; @@ -6463,7 +6451,7 @@ struct TagTypeFoldingSetPlaceholder : public llvm::FoldingSetNode { void Profile(llvm::FoldingSetNodeID &ID) const { const TagType *T = getTagType(); - Profile(ID, T->getKeyword(), T->getQualifier(), T->getOriginalDecl(), + Profile(ID, T->getKeyword(), T->getQualifier(), T->getDecl(), T->isTagOwned(), T->isInjected()); } @@ -6487,11 +6475,11 @@ class RecordType final : public TagType { using TagType::TagType; public: - // FIXME: Temporarily renamed from `getDecl` in order to facilitate - // rebasing, due to change in behaviour. This should be renamed back - // to `getDecl` once the change is settled. - RecordDecl *getOriginalDecl() const { - return reinterpret_cast(TagType::getOriginalDecl()); + RecordDecl *getDecl() const { + return reinterpret_cast(TagType::getDecl()); + } + [[deprecated("Use getDecl instead")]] RecordDecl *getOriginalDecl() const { + return getDecl(); } /// Recursively check all fields in the record for const-ness. If any field @@ -6507,11 +6495,11 @@ class EnumType final : public TagType { using TagType::TagType; public: - // FIXME: Temporarily renamed from `getDecl` in order to facilitate - // rebasing, due to change in behaviour. 
This should be renamed back - // to `getDecl` once the change is settled. - EnumDecl *getOriginalDecl() const { - return reinterpret_cast(TagType::getOriginalDecl()); + EnumDecl *getDecl() const { + return reinterpret_cast(TagType::getDecl()); + } + [[deprecated("Use getDecl instead")]] EnumDecl *getOriginalDecl() const { + return getDecl(); } static bool classof(const Type *T) { return T->getTypeClass() == Enum; } @@ -6542,11 +6530,11 @@ class InjectedClassNameType final : public TagType { bool IsInjected, const Type *CanonicalType); public: - // FIXME: Temporarily renamed from `getDecl` in order to facilitate - // rebasing, due to change in behaviour. This should be renamed back - // to `getDecl` once the change is settled. - CXXRecordDecl *getOriginalDecl() const { - return reinterpret_cast(TagType::getOriginalDecl()); + CXXRecordDecl *getDecl() const { + return reinterpret_cast(TagType::getDecl()); + } + [[deprecated("Use getDecl instead")]] CXXRecordDecl *getOriginalDecl() const { + return getDecl(); } static bool classof(const Type *T) { @@ -8930,8 +8918,8 @@ inline bool Type::isIntegerType() const { if (const EnumType *ET = dyn_cast(CanonicalType)) { // Incomplete enum types are not treated as integer types. // FIXME: In C++, enum types are never integer types. - return IsEnumDeclComplete(ET->getOriginalDecl()) && - !IsEnumDeclScoped(ET->getOriginalDecl()); + return IsEnumDeclComplete(ET->getDecl()) && + !IsEnumDeclScoped(ET->getDecl()); } return isBitIntType(); } @@ -8989,7 +8977,7 @@ inline bool Type::isScalarType() const { if (const EnumType *ET = dyn_cast(CanonicalType)) // Enums are scalar types, but only if they are defined. Incomplete enums // are not treated as scalar types. - return IsEnumDeclComplete(ET->getOriginalDecl()); + return IsEnumDeclComplete(ET->getDecl()); return isa(CanonicalType) || isa(CanonicalType) || isa(CanonicalType) || @@ -9005,7 +8993,7 @@ inline bool Type::isIntegralOrEnumerationType() const { // Check for a complete enum type; incomplete enum types are not properly an // enumeration type in the sense required here. if (const auto *ET = dyn_cast(CanonicalType)) - return IsEnumDeclComplete(ET->getOriginalDecl()); + return IsEnumDeclComplete(ET->getDecl()); return isBitIntType(); } diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h index 3f14ee86d55b1..2cefaa9611c98 100644 --- a/clang/include/clang/AST/TypeLoc.h +++ b/clang/include/clang/AST/TypeLoc.h @@ -793,7 +793,7 @@ struct TagTypeLocInfo { class TagTypeLoc : public ConcreteTypeLoc { public: - TagDecl *getOriginalDecl() const { return getTypePtr()->getOriginalDecl(); } + TagDecl *getDecl() const { return getTypePtr()->getDecl(); } /// True if the tag was defined in this type specifier. bool isDefinition() const; @@ -854,9 +854,7 @@ class RecordTypeLoc : public InheritingConcreteTypeLoc { public: - RecordDecl *getOriginalDecl() const { - return getTypePtr()->getOriginalDecl(); - } + RecordDecl *getDecl() const { return getTypePtr()->getDecl(); } }; /// Wrapper for source info for enum types. 
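// A hedged sketch of client code after this rename: given a placeholder
// clang::TypeLoc TL, the tag declaration is now reached via getDecl()
// rather than getOriginalDecl(). The dump() call is illustrative only.
//
//   if (auto ETL = TL.getAs<clang::EnumTypeLoc>())
//     if (const clang::EnumDecl *ED = ETL.getDecl())
//       ED->dump();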
@@ -864,7 +862,7 @@ class EnumTypeLoc : public InheritingConcreteTypeLoc { public: - EnumDecl *getOriginalDecl() const { return getTypePtr()->getOriginalDecl(); } + EnumDecl *getDecl() const { return getTypePtr()->getDecl(); } }; /// Wrapper for source info for injected class names of class @@ -873,9 +871,7 @@ class InjectedClassNameTypeLoc : public InheritingConcreteTypeLoc { public: - CXXRecordDecl *getOriginalDecl() const { - return getTypePtr()->getOriginalDecl(); - } + CXXRecordDecl *getDecl() const { return getTypePtr()->getDecl(); } }; /// Wrapper for template type parameters. diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 9dc85fb88e267..03613d53b2776 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -575,7 +575,7 @@ let Class = TagType in { let Conditional = [{ !IsCanonical }]; let Read = [{ node->getQualifier() }]; } - def : Property<"TD", TagDeclRef> { let Read = [{ node->getOriginalDecl() }]; } + def : Property<"TD", TagDeclRef> { let Read = [{ node->getDecl() }]; } } let Class = EnumType in { diff --git a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h index 1ab6f11a23e12..c050fb7d797e3 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchersInternal.h +++ b/clang/include/clang/ASTMatchers/ASTMatchersInternal.h @@ -1017,7 +1017,7 @@ class HasDeclarationMatcher : public MatcherInterface { // First, for any types that have a declaration, extract the declaration and // match on it. if (const auto *S = dyn_cast(&Node)) { - return matchesDecl(S->getOriginalDecl(), Finder, Builder); + return matchesDecl(S->getDecl(), Finder, Builder); } if (const auto *S = dyn_cast(&Node)) { return matchesDecl(S->getDecl(), Finder, Builder); diff --git a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.h b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.h new file mode 100644 index 0000000000000..24f8b0b99870a --- /dev/null +++ b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.h @@ -0,0 +1,111 @@ +//===- UncheckedStatusOrAccessModel.h -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_ANALYSIS_FLOWSENSITIVE_MODELS_UNCHECKEDSTATUSORACCESSMODEL_H +#define CLANG_ANALYSIS_FLOWSENSITIVE_MODELS_UNCHECKEDSTATUSORACCESSMODEL_H + +#include "clang/AST/Type.h" +#include "clang/ASTMatchers/ASTMatchers.h" +#include "clang/Analysis/CFG.h" +#include "clang/Analysis/FlowSensitive/CFGMatchSwitch.h" +#include "clang/Analysis/FlowSensitive/DataflowAnalysis.h" +#include "clang/Analysis/FlowSensitive/DataflowEnvironment.h" +#include "clang/Analysis/FlowSensitive/MatchSwitch.h" +#include "clang/Analysis/FlowSensitive/NoopLattice.h" +#include "clang/Analysis/FlowSensitive/StorageLocation.h" +#include "clang/Analysis/FlowSensitive/Value.h" +#include "clang/Basic/SourceLocation.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" + +namespace clang::dataflow::statusor_model { + +// The helper functions exported here are for use of downstream vendor +// extensions of this model. 
+ +// Match declaration of `absl::StatusOr<T>` and bind `T` to "T". +clang::ast_matchers::DeclarationMatcher statusOrClass(); +// Match declaration of `absl::Status`. +clang::ast_matchers::DeclarationMatcher statusClass(); +// Match declaration of `absl::internal_statusor::OperatorBase`. +clang::ast_matchers::DeclarationMatcher statusOrOperatorBaseClass(); +clang::ast_matchers::TypeMatcher statusOrType(); + +// Get the RecordStorageLocation for the `Status` contained in the `StatusOr`. +RecordStorageLocation &locForStatus(RecordStorageLocation &StatusOrLoc); +// Get the StorageLocation for the OK boolean in the `Status`. +StorageLocation &locForOk(RecordStorageLocation &StatusLoc); +// Get the OK boolean in the `Status`, and initialize it if necessary. +BoolValue &valForOk(RecordStorageLocation &StatusLoc, Environment &Env); +// Get synthetic fields for the types modelled by +// `UncheckedStatusOrAccessModel`. +llvm::StringMap<QualType> getSyntheticFields(QualType Ty, QualType StatusType, + const CXXRecordDecl &RD); + +// Initialize the synthetic fields of the `StatusOr`. +// N.B. if it is already initialized, the value gets reset. +BoolValue &initializeStatusOr(RecordStorageLocation &StatusOrLoc, + Environment &Env); +// Initialize the synthetic fields of the `Status`. +// N.B. if it is already initialized, the value gets reset. +BoolValue &initializeStatus(RecordStorageLocation &StatusLoc, Environment &Env); + +// Return true if `Type` is an instantiation of `absl::StatusOr`. +bool isStatusOrType(QualType Type); +// Return true if `Type` is `absl::Status`. +bool isStatusType(QualType Type); + +// Get the `QualType` for `absl::Status`, or a default-constructed +// `QualType` if it does not exist. +QualType findStatusType(const ASTContext &Ctx); + +struct UncheckedStatusOrAccessModelOptions {}; + +// Dataflow analysis that discovers unsafe uses of StatusOr values. +class UncheckedStatusOrAccessModel + : public DataflowAnalysis<UncheckedStatusOrAccessModel, NoopLattice> { +public: + explicit UncheckedStatusOrAccessModel(ASTContext &Ctx, Environment &Env); + + static Lattice initialElement() { return {}; } + + void transfer(const CFGElement &Elt, Lattice &L, Environment &Env); + +private: + CFGMatchSwitch<TransferState<NoopLattice>> TransferMatchSwitch; +}; + +using LatticeTransferState = + TransferState<NoopLattice>; + +// Extend the Builder with the transfer functions for +// `UncheckedStatusOrAccessModel`. This is useful for writing downstream models +// that extend this model.
+CFGMatchSwitch<LatticeTransferState> +buildTransferMatchSwitch(ASTContext &Ctx, + CFGMatchSwitchBuilder<LatticeTransferState> Builder); + +class UncheckedStatusOrAccessDiagnoser { +public: + explicit UncheckedStatusOrAccessDiagnoser( + UncheckedStatusOrAccessModelOptions Options = {}); + + llvm::SmallVector<SourceLocation> operator()( + const CFGElement &Elt, ASTContext &Ctx, + const TransferStateForDiagnostics<NoopLattice> + &State); + +private: + CFGMatchSwitch<const Environment, llvm::SmallVector<SourceLocation>> + DiagnoseMatchSwitch; +}; + +} // namespace clang::dataflow::statusor_model + +#endif // CLANG_ANALYSIS_FLOWSENSITIVE_MODELS_UNCHECKEDSTATUSORACCESSMODEL_H diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 22e60aa9fe312..eb48a0c01fd1e 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1572,6 +1572,23 @@ def HIPManaged : InheritableAttr { let Documentation = [HIPManagedAttrDocs]; } +def CUDAClusterDims : InheritableAttr { + let Spellings = [GNU<"cluster_dims">]; + let Args = [ExprArgument<"X">, ExprArgument<"Y", /*opt=*/1>, ExprArgument<"Z", /*opt=*/1>]; + let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; + let LangOpts = [CUDA]; + let Documentation = [CUDAClusterDimsAttrDoc]; +} + +def CUDANoCluster : InheritableAttr { + let Spellings = [GNU<"no_cluster">]; + let Subjects = SubjectList<[ObjCMethod, FunctionLike]>; + let LangOpts = [CUDA]; + let Documentation = [CUDANoClusterAttrDoc]; +} + +def : MutualExclusions<[CUDAClusterDims, CUDANoCluster]>; + def CUDAInvalidTarget : InheritableAttr { let Spellings = []; let Subjects = SubjectList<[Function]>; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index e0bbda083b5cf..2fdd041c1b46e 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -7545,6 +7545,45 @@ A managed variable can be accessed in both device and host code. }]; } +def CUDAClusterDimsAttrDoc : Documentation { + let Category = DocCatDecl; + let Content = [{ +In CUDA/HIP programming, the ``cluster_dims`` attribute, conventionally exposed as the +``__cluster_dims__`` macro, can be applied to a kernel function to set the dimensions of a +thread block cluster, an optional level of the thread hierarchy that is made up of thread blocks. +``__cluster_dims__`` defines the cluster size as ``(X, Y, Z)``, where each value is the number +of thread blocks in that dimension. The ``cluster_dims`` and ``no_cluster`` attributes are +mutually exclusive. + +.. code:: + + __global__ __cluster_dims__(2, 1, 1) void kernel(...) { + ... + } + + }]; +} + +def CUDANoClusterAttrDoc : Documentation { + let Category = DocCatDecl; + let Content = [{ +In CUDA/HIP programming, a kernel function can still be launched with the cluster feature enabled +at runtime, even without being annotated with ``__cluster_dims__``. The LLVM/Clang-exclusive +``no_cluster`` attribute, conventionally exposed as the ``__no_cluster__`` macro, can be applied to +a kernel function to explicitly indicate that the cluster feature will not be enabled either at +compile time or at kernel launch time. This allows the compiler to apply certain optimizations +without assuming that clustering could be enabled at runtime. It is undefined behavior to launch a +kernel annotated with ``__no_cluster__`` if the cluster feature is enabled at runtime. +The ``cluster_dims`` and ``no_cluster`` attributes are mutually exclusive. + +.. code:: + + __global__ __no_cluster__ void kernel(...) { + ...
+ } + }]; +} + def LifetimeOwnerDocs : Documentation { let Category = DocCatDecl; let Content = [{ diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 792e2e07ec594..a350acdf146ab 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4957,6 +4957,18 @@ def HLSLResourceNonUniformIndex : LangBuiltin<"HLSL_LANG"> { let Prototype = "uint32_t(uint32_t)"; } +def HLSLResourceGetDimensionsX : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_resource_getdimensions_x"]; + let Attributes = [NoThrow]; + let Prototype = "void(...)"; +} + +def HLSLResourceGetStride : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_resource_getstride"]; + let Attributes = [NoThrow]; + let Prototype = "void(...)"; +} + def HLSLAll : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_all"]; let Attributes = [NoThrow, Const]; diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index b916456d76de2..f98266f9a86fe 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -190,6 +190,9 @@ TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_atomic_fmax_f64, "ddQbiiIi", "t", TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_load_lds, "vQbv*3IUiiiIiIi", "t", "vmem-to-lds-load-insts") TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi", "t", "vmem-to-lds-load-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_b128, "V4UiV4Ui*1cC*", "n", "gfx940-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_store_b128, "vV4Ui*1V4UicC*", "n", "gfx940-insts") + //===----------------------------------------------------------------------===// // Ballot builtins. 
//===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index b197d81e8feaa..54b3ce08f5625 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -110,34 +110,35 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { } let Features = "sse3" in { - foreach Op = ["addsub", "hadd", "hsub"] in { + foreach Op = ["addsub"] in { def Op#ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">; def Op#pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>)">; } } - let Features = "ssse3" in { - foreach Op = ["phadd", "phsub"] in { - def Op#w128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; - def Op#sw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; - def Op#d128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; + let Features = "sse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + foreach Op = ["hadd", "hsub"] in { + def Op#ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">; + def Op#pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>)">; } + } - def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; - def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; + let Features = "ssse3" in { def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; } let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">; + def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; } } // AVX let Attributes = [Const, NoThrow, RequiredVectorWidth<256>], Features = "avx" in { - foreach Op = ["addsub", "hadd", "hsub", "max", "min"] in { + foreach Op = ["addsub", "max", "min"] in { def Op#pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">; def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">; } @@ -184,7 +185,8 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in def cvttss2si : X86Builtin<"int(_Vector<4, float>)">; } -let Features = "sse", Attributes = [NoThrow, RequiredVectorWidth<128>] in { +let Features = "sse", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def movmskps : X86Builtin<"int(_Vector<4, float>)">; } @@ -210,11 +212,6 @@ let Features = "sse2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { def maskmovdqu : X86Builtin<"void(_Vector<16, char>, _Vector<16, char>, char *)">; } -let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def movmskpd : X86Builtin<"int(_Vector<2, double>)">; - def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">; -} - let Features = "sse2", Attributes = [NoThrow] in { def movnti : X86Builtin<"void(int *, int)">; } @@ -223,6 +220,8 @@ let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def pshuflw : X86Builtin<"_Vector<8, 
short>(_Vector<8, short>, _Constant int)">; def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">; def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">; + def movmskpd : X86Builtin<"int(_Vector<2, double>)">; + def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">; } let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { @@ -316,6 +315,14 @@ let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] def palignr128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant int)">; } +let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + foreach Op = ["phadd", "phsub"] in { + def Op#w128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; + def Op#sw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; + def Op#d128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; + } +} + let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">; def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">; @@ -325,8 +332,8 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] def dpps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">; def dppd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, " "_Vector<2,double>, _Constant char)">; - def mpsadbw128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">; - def phminposuw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>)">; + def mpsadbw128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, " + "_Vector<16, char>, _Constant char)">; } let Features = "sse4.1", @@ -349,6 +356,7 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVector def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">; def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; + def phminposuw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>)">; def vec_ext_v16qi : X86Builtin<"char(_Vector<16, char>, _Constant int)">; def vec_set_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, char, _Constant int)">; @@ -489,9 +497,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">; def cmppd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant char)">; def cmpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">; - def vextractf128_pd256 : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int)">; - def vextractf128_ps256 : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int)">; - def vextractf128_si256 : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int)">; def cvtpd2ps256 : X86Builtin<"_Vector<4, float>(_Vector<4, double>)">; def cvtps2dq256 : X86Builtin<"_Vector<8, int>(_Vector<8, float>)">; def cvttpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">; @@ -512,9 +517,17 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">; def blendvpd256 : 
X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">; def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">; + def vextractf128_pd256 : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int)">; + def vextractf128_ps256 : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int)">; + def vextractf128_si256 : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int)">; def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">; def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">; def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">; + + foreach Op = ["hadd", "hsub"] in { + def Op#pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">; + def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">; + } } let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { @@ -548,11 +561,8 @@ let Features = "avx", def vtestnzcps256 : X86Builtin<"int(_Vector<8, float>, _Vector<8, float>)">; def ptestz256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">; def ptestc256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">; - def ptestnzc256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">; -} - -let Features = "avx", - Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { + def ptestnzc256 + : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">; def movmskpd256 : X86Builtin<"int(_Vector<4, double>)">; def movmskps256 : X86Builtin<"int(_Vector<8, float>)">; } @@ -591,17 +601,9 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">; - def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">; - def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; - def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; - def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; - def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; - def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; - def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; - def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">; - def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; + def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, " + "_Vector<32, char>, _Constant int)">; def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; - def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; @@ -620,11 +622,11 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def 
permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">; def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">; def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; - def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">; } let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">; def pavgb256 : X86Builtin<"_Vector<32, unsigned char>(_Vector<32, unsigned char>, _Vector<32, unsigned char>)">; def pavgw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">; @@ -640,6 +642,8 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; + def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; + def psllwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">; def pslldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">; def psllqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">; @@ -651,6 +655,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def psrawi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">; def psradi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">; + def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def pmulhuw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">; def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; @@ -666,6 +671,13 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">; def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">; + def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; + def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; + def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; + def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; + def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; + def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; + def pshuflw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">; def pshufhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">; def pshufd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">; @@ -677,6 +689,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">; def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">; + def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">; } let Features = "avx2", 
Attributes = [NoThrow, RequiredVectorWidth<256>] in { @@ -1078,7 +1091,7 @@ let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256 def alignq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def extractf64x4_mask : X86Builtin<"_Vector<4, double>(_Vector<8, double>, _Constant int, _Vector<4, double>, unsigned char)">; def extractf32x4_mask : X86Builtin<"_Vector<4, float>(_Vector<16, float>, _Constant int, _Vector<4, float>, unsigned char)">; } @@ -1331,7 +1344,6 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512> let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">; - def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">; } let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { @@ -1339,25 +1351,21 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVect def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">; def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">; def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">; -} -let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpconflictdi_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">; -} - -let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpconflictdi_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>)">; + def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">; } -let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def vpconflictdi_128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>)">; def vpconflictsi_128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>)">; } -let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def vpconflictdi_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>)">; def vpconflictsi_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>)">; } -let Features = "avx512cd", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512cd", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpconflictdi_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>)">; def vpconflictsi_512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>)">; } @@ -1374,13 +1382,10 @@ let Features = "avx512bitalg", Attributes = [NoThrow, Const, RequiredVectorWidth def vpshufbitqmb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def pmulhrsw512 : X86Builtin<"_Vector<32, 
short>(_Vector<32, short>, _Vector<32, short>)">; -} - let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def pavgb512 : X86Builtin<"_Vector<64, unsigned char>(_Vector<64, unsigned char>, _Vector<64, unsigned char>)">; def pavgw512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, unsigned short>, _Vector<32, unsigned short>)">; + def pmulhrsw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; def pmulhuw512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, unsigned short>, _Vector<32, unsigned short>)">; def pmulhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; } @@ -2121,24 +2126,18 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { def movdqa64store256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">; } -let Features = "avx512ifma", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512ifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpmadd52huq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">; def vpmadd52luq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">; } -let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpmadd52huq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">; -} - -let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpmadd52huq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">; -} - -let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def vpmadd52luq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">; } -let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def vpmadd52huq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">; def vpmadd52luq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">; } @@ -2957,24 +2956,24 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { def pmovqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">; } -let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def extractf32x8_mask : X86Builtin<"_Vector<8, float>(_Vector<16, float>, _Constant int, _Vector<8, float>, unsigned char)">; def extractf64x2_512_mask : X86Builtin<"_Vector<2, double>(_Vector<8, double>, _Constant int, _Vector<2, double>, unsigned char)">; def extracti32x8_mask : X86Builtin<"_Vector<8, int>(_Vector<16, int>, _Constant int, _Vector<8, 
int>, unsigned char)">; def extracti64x2_512_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def extracti32x4_mask : X86Builtin<"_Vector<4, int>(_Vector<16, int>, _Constant int, _Vector<4, int>, unsigned char)">; def extracti64x4_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, long long int>, _Constant int, _Vector<4, long long int>, unsigned char)">; } -let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def extractf64x2_256_mask : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int, _Vector<2, double>, unsigned char)">; def extracti64x2_256_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def extractf32x4_256_mask : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int, _Vector<4, float>, unsigned char)">; def extracti32x4_256_mask : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int, _Vector<4, int>, unsigned char)">; } diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index c724136a7fdaf..e5e071f43fa75 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -505,7 +505,7 @@ def err_expected_coloncolon_after_super : Error< "expected '::' after '__super'">; def ext_decomp_decl_empty : ExtWarn< - "ISO C++17 does not allow a decomposition group to be empty">, + "ISO C++17 does not allow a structured binding group to be empty">, InGroup>; def err_function_parameter_limit_exceeded : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 1072f4e86a901..dd1fbc68c17e2 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -31,12 +31,12 @@ defm constexpr_body_multiple_return : CXX14Compat< defm variable_template : CXX14Compat<"variable templates are">; // C++17 compatibility with C++14 and earlier. -defm decomp_decl : CXX17Compat<"decomposition declarations are">; +defm decomp_decl : CXX17Compat<"structured binding declarations are">; defm inline_variable : CXX17Compat<"inline variables are">; // C++20 compatibility with C++17 and earlier. 
defm decomp_decl_spec - : CXX20Compat<"decomposition declaration declared '%0' is">; + : CXX20Compat<"structured binding declaration declared '%0' is">; defm constexpr_local_var_no_init : CXX20Compat< "uninitialized variable in a constexpr %select{function|constructor}0 is">; defm constexpr_function_try_block : CXX20Compat< @@ -591,58 +591,58 @@ def warn_modifying_shadowing_decl : // C++ decomposition declarations def err_decomp_decl_context : Error< - "decomposition declaration not permitted in this context">; + "structured binding declaration not permitted in this context">; def err_decomp_decl_spec - : Error<"decomposition declaration cannot be declared '%0'">; + : Error<"structured binding declaration cannot be declared '%0'">; def err_decomp_decl_type : Error< - "decomposition declaration cannot be declared with type %0; " + "structured binding declaration cannot be declared with type %0; " "declared type must be 'auto' or reference to 'auto'">; def err_decomp_decl_constraint : Error< - "decomposition declaration cannot be declared with constrained 'auto'">; + "structured binding declaration cannot be declared with constrained 'auto'">; def err_decomp_decl_parens : Error< - "decomposition declaration cannot be declared with parentheses">; + "structured binding declaration cannot be declared with parentheses">; def err_decomp_decl_template : Error< - "decomposition declaration cannot be a template">; + "structured binding declaration cannot be a template">; def err_decomp_decl_not_alone : Error< - "decomposition declaration must be the only declaration in its group">; + "structured binding declaration must be the only declaration in its group">; def err_decomp_decl_requires_init : Error< - "decomposition declaration %0 requires an initializer">; + "structured binding declaration %0 requires an initializer">; def err_decomp_decl_wrong_number_bindings : Error< - "type %0 decomposes into %3 %plural{1:element|:elements}2, but " + "type %0 binds to %3 %plural{1:element|:elements}2, but " "%select{%plural{0:no|:only %1}1|%1}4 " "%plural{1:name was|:names were}1 provided">; def err_decomp_decl_unbindable_type : Error< - "cannot decompose %select{union|non-class, non-array}1 type %2">; + "cannot bind %select{union|non-class, non-array}1 type %2">; def err_decomp_decl_multiple_bases_with_members : Error< - "cannot decompose class type %1: " + "cannot bind class type %1: " "%select{its base classes %2 and|both it and its base class}0 %3 " "have non-static data members">; def err_decomp_decl_ambiguous_base : Error< - "cannot decompose members of ambiguous base class %1 of %0:%2">; + "cannot bind members of ambiguous base class %1 of %0:%2">; def err_decomp_decl_inaccessible_base : Error< - "cannot decompose members of inaccessible base class %1 of %0">, + "cannot bind members of inaccessible base class %1 of %0">, AccessControl; def err_decomp_decl_inaccessible_field : Error< - "cannot decompose %select{private|protected}0 member %1 of %3">, + "cannot bind %select{private|protected}0 member %1 of %3">, AccessControl; def err_decomp_decl_lambda : Error< - "cannot decompose lambda closure type">; + "cannot bind lambda closure type">; def err_decomp_decl_anon_union_member : Error< - "cannot decompose class type %0 because it has an anonymous " + "cannot bind class type %0 because it has an anonymous " "%select{struct|union}1 member">; def err_decomp_decl_std_tuple_element_not_specialized : Error< - "cannot decompose this type; 'std::tuple_element<%0>::type' " + "cannot bind this type; 
'std::tuple_element<%0>::type' " "does not name a type">; def err_decomp_decl_std_tuple_size_not_constant : Error< - "cannot decompose this type; 'std::tuple_size<%0>::value' " + "cannot bind this type; 'std::tuple_size<%0>::value' " "is not a valid integral constant expression">; def err_decomp_decl_std_tuple_size_invalid - : Error<"cannot decompose this type; 'std::tuple_size<%0>::value' " + : Error<"cannot bind this type; 'std::tuple_size<%0>::value' " "is not a valid size: %1">; def note_in_binding_decl_init : Note< "in implicit initialization of binding declaration %0">; def err_arg_is_not_destructurable : Error< - "type %0 cannot be decomposed">; + "type %0 cannot be bound">; def err_std_type_trait_not_class_template : Error< "unsupported standard library implementation: " @@ -2616,7 +2616,7 @@ def err_auto_variable_cannot_appear_in_own_initializer "declared with deduced type %2 cannot appear in its own initializer">; def err_binding_cannot_appear_in_own_initializer : Error< "binding %0 cannot appear in the initializer of its own " - "decomposition declaration">; + "structured binding declaration">; def err_new_array_of_auto : Error< "cannot allocate array of 'auto'">; def err_auto_not_allowed : Error< @@ -13070,6 +13070,12 @@ def warn_cuda_maxclusterrank_sm_90 : Warning< "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring " "%1 attribute">, InGroup; +def err_cluster_attr_not_supported : Error< + "%0 is not supported for this GPU architecture">; + +def err_cluster_dims_too_large : Error< + "cluster does not support more than %0 thread blocks; %1 provided">; + // VTable pointer authentication errors def err_non_polymorphic_vtable_pointer_auth : Error< "cannot set vtable pointer authentication on monomorphic type %0">; @@ -13670,6 +13676,9 @@ def err_acc_reduction_recipe_no_op "not have a valid operation available">; def note_acc_reduction_recipe_noop_field : Note<"while forming combiner for compound type %0">; +def note_acc_reduction_combiner_forming + : Note<"while forming %select{|binary operator '%1'|conditional " + "operator|final assignment operator}0">; // AMDGCN builtins diagnostics def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">; diff --git a/clang/include/clang/Basic/DirectoryEntry.h b/clang/include/clang/Basic/DirectoryEntry.h index 35fe529ba79df..69656125e422a 100644 --- a/clang/include/clang/Basic/DirectoryEntry.h +++ b/clang/include/clang/Basic/DirectoryEntry.h @@ -119,10 +119,10 @@ namespace FileMgr { /// the private optional_none_tag to keep it to the size of a single pointer. template class MapEntryOptionalStorage { using optional_none_tag = typename RefTy::optional_none_tag; - RefTy MaybeRef; + RefTy MaybeRef = optional_none_tag(); public: - MapEntryOptionalStorage() : MaybeRef(optional_none_tag()) {} + MapEntryOptionalStorage() = default; template explicit MapEntryOptionalStorage(std::in_place_t, ArgTypes &&...Args) @@ -168,11 +168,7 @@ class OptionalStorage clang::FileMgr::MapEntryOptionalStorage; public: - OptionalStorage() = default; - - template - explicit OptionalStorage(std::in_place_t, ArgTypes &&...Args) - : StorageImpl(std::in_place_t{}, std::forward(Args)...) 
{} + using StorageImpl::StorageImpl; OptionalStorage &operator=(clang::DirectoryEntryRef Ref) { StorageImpl::operator=(Ref); diff --git a/clang/include/clang/Basic/FileEntry.h b/clang/include/clang/Basic/FileEntry.h index c973ba38bdf7e..e7091fd1def59 100644 --- a/clang/include/clang/Basic/FileEntry.h +++ b/clang/include/clang/Basic/FileEntry.h @@ -218,11 +218,7 @@ class OptionalStorage clang::FileMgr::MapEntryOptionalStorage; public: - OptionalStorage() = default; - - template - explicit OptionalStorage(std::in_place_t, ArgTypes &&...Args) - : StorageImpl(std::in_place_t{}, std::forward(Args)...) {} + using StorageImpl::StorageImpl; OptionalStorage &operator=(clang::FileEntryRef Ref) { StorageImpl::operator=(Ref); diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 3b13e0221a332..74eee0761bac4 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -258,6 +258,7 @@ ENUM_LANGOPT(HLSLVersion, HLSLLangStd, 16, HLSL_Unset, NotCompatible, "HLSL Vers LANGOPT(HLSLStrictAvailability, 1, 0, NotCompatible, "Strict availability diagnostic mode for HLSL built-in functions.") LANGOPT(HLSLSpvUseUnknownImageFormat, 1, 0, NotCompatible, "For storage images and texel buffers, sets the default format to 'Unknown' when not specified via the `vk::image_format` attribute. If this option is not used, the format is inferred from the resource's data type.") +LANGOPT(HLSLSpvEnableMaximalReconvergence, 1, 0, NotCompatible, "Enables the MaximallyReconvergesKHR execution mode for this module. This ensures that control flow reconverges at well-defined merge points as defined by the Vulkan spec.") LANGOPT(CUDAIsDevice , 1, 0, NotCompatible, "compiling for CUDA device") LANGOPT(CUDAHostDeviceConstexpr, 1, 1, NotCompatible, "treating unattributed constexpr functions as __host__ __device__") @@ -447,6 +448,7 @@ ENUM_LANGOPT(RegisterStaticDestructors, RegisterStaticDestructorsKind, 2, LANGOPT(RegCall4, 1, 0, NotCompatible, "Set __regcall4 as a default calling convention to respect __regcall ABI v.4") LANGOPT(MatrixTypes, 1, 0, NotCompatible, "Enable or disable the builtin matrix type") +VALUE_LANGOPT(MaxMatrixDimension, 32, (1 << 20) - 1, NotCompatible, "maximum allowed matrix dimension") LANGOPT(CXXAssumptions, 1, 1, NotCompatible, "Enable or disable codegen and compile-time checks for C++23's [[assume]] attribute") diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 41595ec2a060d..260a7537edb9d 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -756,6 +756,15 @@ class LangOptions : public LangOptionsBase { bool isTargetDevice() const { return OpenMPIsTargetDevice || CUDAIsDevice || SYCLIsDevice; } + + /// Returns the most applicable C standard-compliant language version code. + /// If none could be determined, returns \ref std::nullopt. + std::optional getCLangStd() const; + + /// Returns the most applicable C++ standard-compliant language + /// version code. + /// If none could be determined, returns \ref std::nullopt. + std::optional getCPlusPlusLangStd() const; }; /// Floating point control options diff --git a/clang/include/clang/Basic/LangStandard.h b/clang/include/clang/Basic/LangStandard.h index 49412232c9c5e..64645fee933a5 100644 --- a/clang/include/clang/Basic/LangStandard.h +++ b/clang/include/clang/Basic/LangStandard.h @@ -70,8 +70,7 @@ enum LangFeatures { /// standard. 
struct LangStandard { enum Kind { -#define LANGSTANDARD(id, name, lang, desc, features) \ - lang_##id, +#define LANGSTANDARD(id, name, lang, desc, features, version) lang_##id, #include "clang/Basic/LangStandards.def" lang_unspecified }; @@ -80,6 +79,7 @@ struct LangStandard { const char *Description; unsigned Flags; clang::Language Language; + std::optional Version; public: /// getName - Get the name of this standard. @@ -91,6 +91,9 @@ struct LangStandard { /// Get the language that this standard describes. clang::Language getLanguage() const { return Language; } + /// Get the version code for this language standard. + std::optional getVersion() const { return Version; } + /// Language supports '//' comments. bool hasLineComments() const { return Flags & LineComment; } diff --git a/clang/include/clang/Basic/LangStandards.def b/clang/include/clang/Basic/LangStandards.def index 244692ab4296a..4edc93503cdf5 100644 --- a/clang/include/clang/Basic/LangStandards.def +++ b/clang/include/clang/Basic/LangStandards.def @@ -10,7 +10,7 @@ #error "LANGSTANDARD must be defined before including this file" #endif -/// LANGSTANDARD(IDENT, NAME, LANG, DESC, FEATURES) +/// LANGSTANDARD(IDENT, NAME, LANG, DESC, FEATURES, VERSION) /// /// \param IDENT - The name of the standard as a C++ identifier. /// \param NAME - The name of the standard. @@ -18,6 +18,8 @@ /// \param DESC - A short description of the standard. /// \param FEATURES - The standard features as flags, these are enums from the /// clang::frontend namespace, which is assumed to be available. +/// \param VERSION - The official version code for this standard. +/// Has value 'std::nullopt' if no official version exists. /// LANGSTANDARD_ALIAS(IDENT, ALIAS) /// \param IDENT - The name of the standard as a C++ identifier. @@ -36,186 +38,188 @@ // C89-ish modes. 
LANGSTANDARD(c89, "c89", - C, "ISO C 1990", 0) + C, "ISO C 1990", 0, std::nullopt) LANGSTANDARD_ALIAS(c89, "c90") LANGSTANDARD_ALIAS(c89, "iso9899:1990") LANGSTANDARD(c94, "iso9899:199409", C, "ISO C 1990 with amendment 1", - Digraphs) + Digraphs, 199409) LANGSTANDARD(gnu89, "gnu89", C, "ISO C 1990 with GNU extensions", - LineComment | Digraphs | GNUMode) + LineComment | Digraphs | GNUMode, std::nullopt) LANGSTANDARD_ALIAS(gnu89, "gnu90") // C99-ish modes LANGSTANDARD(c99, "c99", C, "ISO C 1999", - LineComment | C99 | Digraphs | HexFloat) + LineComment | C99 | Digraphs | HexFloat, 199901) LANGSTANDARD_ALIAS(c99, "iso9899:1999") LANGSTANDARD_ALIAS_DEPR(c99, "c9x") LANGSTANDARD_ALIAS_DEPR(c99, "iso9899:199x") LANGSTANDARD(gnu99, "gnu99", C, "ISO C 1999 with GNU extensions", - LineComment | C99 | Digraphs | GNUMode | HexFloat) + LineComment | C99 | Digraphs | GNUMode | HexFloat, 199901) LANGSTANDARD_ALIAS_DEPR(gnu99, "gnu9x") // C11 modes LANGSTANDARD(c11, "c11", C, "ISO C 2011", - LineComment | C99 | C11 | Digraphs | HexFloat) + LineComment | C99 | C11 | Digraphs | HexFloat, 201112) LANGSTANDARD_ALIAS(c11, "iso9899:2011") LANGSTANDARD_ALIAS_DEPR(c11, "c1x") LANGSTANDARD_ALIAS_DEPR(c11, "iso9899:201x") LANGSTANDARD(gnu11, "gnu11", C, "ISO C 2011 with GNU extensions", - LineComment | C99 | C11 | Digraphs | GNUMode | HexFloat) + LineComment | C99 | C11 | Digraphs | GNUMode | HexFloat, 201112) LANGSTANDARD_ALIAS_DEPR(gnu11, "gnu1x") // C17 modes LANGSTANDARD(c17, "c17", C, "ISO C 2017", - LineComment | C99 | C11 | C17 | Digraphs | HexFloat) + LineComment | C99 | C11 | C17 | Digraphs | HexFloat, 201710) LANGSTANDARD_ALIAS(c17, "iso9899:2017") LANGSTANDARD_ALIAS(c17, "c18") LANGSTANDARD_ALIAS(c17, "iso9899:2018") LANGSTANDARD(gnu17, "gnu17", C, "ISO C 2017 with GNU extensions", - LineComment | C99 | C11 | C17 | Digraphs | GNUMode | HexFloat) + LineComment | C99 | C11 | C17 | Digraphs | GNUMode | HexFloat, 201710) LANGSTANDARD_ALIAS(gnu17, "gnu18") // C23 modes LANGSTANDARD(c23, "c23", C, "ISO C 2023", - LineComment | C99 | C11 | C17 | C23 | Digraphs | HexFloat) + LineComment | C99 | C11 | C17 | C23 | Digraphs | HexFloat, 202311) LANGSTANDARD_ALIAS(c23, "iso9899:2024") LANGSTANDARD_ALIAS_DEPR(c23, "c2x") LANGSTANDARD(gnu23, "gnu23", C, "ISO C 2023 with GNU extensions", - LineComment | C99 | C11 | C17 | C23 | Digraphs | GNUMode | HexFloat) + LineComment | C99 | C11 | C17 | C23 | Digraphs | GNUMode | HexFloat, 202311) LANGSTANDARD_ALIAS_DEPR(gnu23, "gnu2x") // C2y modes +// FIXME: Use correct version code for C2y once published. LANGSTANDARD(c2y, "c2y", C, "Working Draft for ISO C2y", - LineComment | C99 | C11 | C17 | C23 | C2y | Digraphs | HexFloat) + LineComment | C99 | C11 | C17 | C23 | C2y | Digraphs | HexFloat, 202400) LANGSTANDARD(gnu2y, "gnu2y", C, "Working Draft for ISO C2y with GNU extensions", - LineComment | C99 | C11 | C17 | C23 | C2y | Digraphs | GNUMode | HexFloat) + LineComment | C99 | C11 | C17 | C23 | C2y | Digraphs | GNUMode | HexFloat, 202400) // TODO: Add the iso9899:202y alias once ISO publishes the standard. 
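The VERSION codes for the C modes above line up with the values the corresponding `-std=` modes predefine for `__STDC_VERSION__` (modulo the `L` suffix), which gives a convenient cross-check; c89 carries `std::nullopt` because C90 predates that macro. A sketch:

```cpp
// Illustrative cross-check: the VERSION field for c23 is 202311, and
// `clang -std=c23` predefines __STDC_VERSION__ as 202311L.
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
#define HAS_C23 1
#else
#define HAS_C23 0
#endif
```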
// C++ modes LANGSTANDARD(cxx98, "c++98", CXX, "ISO C++ 1998 with amendments", - LineComment | CPlusPlus | Digraphs) + LineComment | CPlusPlus | Digraphs, 199711) LANGSTANDARD_ALIAS(cxx98, "c++03") LANGSTANDARD(gnucxx98, "gnu++98", CXX, "ISO C++ 1998 with amendments and GNU extensions", - LineComment | CPlusPlus | Digraphs | GNUMode) + LineComment | CPlusPlus | Digraphs | GNUMode, 199711) LANGSTANDARD_ALIAS(gnucxx98, "gnu++03") LANGSTANDARD(cxx11, "c++11", CXX, "ISO C++ 2011 with amendments", - LineComment | CPlusPlus | CPlusPlus11 | Digraphs) + LineComment | CPlusPlus | CPlusPlus11 | Digraphs, 201103) LANGSTANDARD_ALIAS_DEPR(cxx11, "c++0x") LANGSTANDARD(gnucxx11, "gnu++11", CXX, "ISO C++ 2011 with amendments and GNU extensions", - LineComment | CPlusPlus | CPlusPlus11 | Digraphs | GNUMode) + LineComment | CPlusPlus | CPlusPlus11 | Digraphs | GNUMode, 201103) LANGSTANDARD_ALIAS_DEPR(gnucxx11, "gnu++0x") LANGSTANDARD(cxx14, "c++14", CXX, "ISO C++ 2014 with amendments", - LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | Digraphs) + LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | Digraphs, 201402) LANGSTANDARD_ALIAS_DEPR(cxx14, "c++1y") LANGSTANDARD(gnucxx14, "gnu++14", CXX, "ISO C++ 2014 with amendments and GNU extensions", LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | Digraphs | - GNUMode) + GNUMode, 201402) LANGSTANDARD_ALIAS_DEPR(gnucxx14, "gnu++1y") LANGSTANDARD(cxx17, "c++17", CXX, "ISO C++ 2017 with amendments", LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 | - Digraphs | HexFloat) + Digraphs | HexFloat, 201703) LANGSTANDARD_ALIAS_DEPR(cxx17, "c++1z") LANGSTANDARD(gnucxx17, "gnu++17", CXX, "ISO C++ 2017 with amendments and GNU extensions", LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 | - Digraphs | HexFloat | GNUMode) + Digraphs | HexFloat | GNUMode, 201703) LANGSTANDARD_ALIAS_DEPR(gnucxx17, "gnu++1z") LANGSTANDARD(cxx20, "c++20", CXX, "ISO C++ 2020 DIS", LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 | - CPlusPlus20 | Digraphs | HexFloat) + CPlusPlus20 | Digraphs | HexFloat, 202002) LANGSTANDARD_ALIAS_DEPR(cxx20, "c++2a") LANGSTANDARD(gnucxx20, "gnu++20", CXX, "ISO C++ 2020 DIS with GNU extensions", LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 | - CPlusPlus20 | Digraphs | HexFloat | GNUMode) + CPlusPlus20 | Digraphs | HexFloat | GNUMode, 202002) LANGSTANDARD_ALIAS_DEPR(gnucxx20, "gnu++2a") LANGSTANDARD(cxx23, "c++23", CXX, "ISO C++ 2023 DIS", LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 | - CPlusPlus20 | CPlusPlus23 | Digraphs | HexFloat) + CPlusPlus20 | CPlusPlus23 | Digraphs | HexFloat, 202302) LANGSTANDARD_ALIAS_DEPR(cxx23, "c++2b") LANGSTANDARD(gnucxx23, "gnu++23", CXX, "ISO C++ 2023 DIS with GNU extensions", LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 | - CPlusPlus20 | CPlusPlus23 | Digraphs | HexFloat | GNUMode) + CPlusPlus20 | CPlusPlus23 | Digraphs | HexFloat | GNUMode, 202302) LANGSTANDARD_ALIAS_DEPR(gnucxx23, "gnu++2b") +// FIXME: Use correct version code for C++26 once published. 
LANGSTANDARD(cxx26, "c++2c", CXX, "Working draft for C++2c", LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 | - CPlusPlus20 | CPlusPlus23 | CPlusPlus26 | Digraphs | HexFloat) + CPlusPlus20 | CPlusPlus23 | CPlusPlus26 | Digraphs | HexFloat, 202400) LANGSTANDARD_ALIAS(cxx26, "c++26") LANGSTANDARD(gnucxx26, "gnu++2c", CXX, "Working draft for C++2c with GNU extensions", LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 | - CPlusPlus20 | CPlusPlus23 | CPlusPlus26 | Digraphs | HexFloat | GNUMode) + CPlusPlus20 | CPlusPlus23 | CPlusPlus26 | Digraphs | HexFloat | GNUMode, 202400) LANGSTANDARD_ALIAS(gnucxx26, "gnu++26") // OpenCL LANGSTANDARD(opencl10, "cl1.0", OpenCL, "OpenCL 1.0", - LineComment | C99 | Digraphs | HexFloat | OpenCL) + LineComment | C99 | Digraphs | HexFloat | OpenCL, std::nullopt) LANGSTANDARD_ALIAS_DEPR(opencl10, "cl") LANGSTANDARD(opencl11, "cl1.1", OpenCL, "OpenCL 1.1", - LineComment | C99 | Digraphs | HexFloat | OpenCL) + LineComment | C99 | Digraphs | HexFloat | OpenCL, std::nullopt) LANGSTANDARD(opencl12, "cl1.2", OpenCL, "OpenCL 1.2", - LineComment | C99 | Digraphs | HexFloat | OpenCL) + LineComment | C99 | Digraphs | HexFloat | OpenCL, std::nullopt) LANGSTANDARD(opencl20, "cl2.0", OpenCL, "OpenCL 2.0", - LineComment | C99 | Digraphs | HexFloat | OpenCL) + LineComment | C99 | Digraphs | HexFloat | OpenCL, std::nullopt) LANGSTANDARD(opencl30, "cl3.0", OpenCL, "OpenCL 3.0", - LineComment | C99 | Digraphs | HexFloat | OpenCL) + LineComment | C99 | Digraphs | HexFloat | OpenCL, std::nullopt) LANGSTANDARD(openclcpp10, "clc++1.0", OpenCL, "C++ for OpenCL 1.0", LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 | - Digraphs | HexFloat | OpenCL) + Digraphs | HexFloat | OpenCL, std::nullopt) LANGSTANDARD_ALIAS(openclcpp10, "clc++") LANGSTANDARD(openclcpp2021, "clc++2021", OpenCL, "C++ for OpenCL 2021", LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 | - Digraphs | HexFloat | OpenCL) + Digraphs | HexFloat | OpenCL, std::nullopt) LANGSTANDARD_ALIAS_DEPR(opencl10, "CL") LANGSTANDARD_ALIAS_DEPR(opencl11, "CL1.1") @@ -229,35 +233,35 @@ LANGSTANDARD_ALIAS_DEPR(openclcpp2021, "CLC++2021") // HLSL LANGSTANDARD(hlsl, "hlsl", HLSL, "High Level Shader Language", - LineComment | HLSL | CPlusPlus | CPlusPlus11) + LineComment | HLSL | CPlusPlus | CPlusPlus11, std::nullopt) LANGSTANDARD(hlsl2015, "hlsl2015", HLSL, "High Level Shader Language 2015", - LineComment | HLSL | CPlusPlus | CPlusPlus11) + LineComment | HLSL | CPlusPlus | CPlusPlus11, std::nullopt) LANGSTANDARD(hlsl2016, "hlsl2016", HLSL, "High Level Shader Language 2016", - LineComment | HLSL | CPlusPlus | CPlusPlus11) + LineComment | HLSL | CPlusPlus | CPlusPlus11, std::nullopt) LANGSTANDARD(hlsl2017, "hlsl2017", HLSL, "High Level Shader Language 2017", - LineComment | HLSL | CPlusPlus | CPlusPlus11) + LineComment | HLSL | CPlusPlus | CPlusPlus11, std::nullopt) LANGSTANDARD(hlsl2018, "hlsl2018", HLSL, "High Level Shader Language 2018", - LineComment | HLSL | CPlusPlus | CPlusPlus11) + LineComment | HLSL | CPlusPlus | CPlusPlus11, std::nullopt) LANGSTANDARD(hlsl2021, "hlsl2021", HLSL, "High Level Shader Language 2021", - LineComment | HLSL | CPlusPlus | CPlusPlus11) + LineComment | HLSL | CPlusPlus | CPlusPlus11, std::nullopt) LANGSTANDARD(hlsl202x, "hlsl202x", HLSL, "High Level Shader Language 202x", - LineComment | HLSL | CPlusPlus | CPlusPlus11) + LineComment | HLSL | CPlusPlus | CPlusPlus11, std::nullopt) LANGSTANDARD(hlsl202y, "hlsl202y", HLSL, "High Level Shader 
Language 202y", - LineComment | HLSL | CPlusPlus | CPlusPlus11) + LineComment | HLSL | CPlusPlus | CPlusPlus11, std::nullopt) #undef LANGSTANDARD diff --git a/clang/include/clang/Basic/SyncScope.h b/clang/include/clang/Basic/SyncScope.h index cc8b020cbb1bb..27a31b491a508 100644 --- a/clang/include/clang/Basic/SyncScope.h +++ b/clang/include/clang/Basic/SyncScope.h @@ -43,11 +43,13 @@ enum class SyncScope { SystemScope, DeviceScope, WorkgroupScope, + ClusterScope, WavefrontScope, SingleScope, HIPSingleThread, HIPWavefront, HIPWorkgroup, + HIPCluster, HIPAgent, HIPSystem, OpenCLWorkGroup, @@ -65,6 +67,8 @@ inline llvm::StringRef getAsString(SyncScope S) { return "device_scope"; case SyncScope::WorkgroupScope: return "workgroup_scope"; + case SyncScope::ClusterScope: + return "cluster_scope"; case SyncScope::WavefrontScope: return "wavefront_scope"; case SyncScope::SingleScope: @@ -75,6 +79,8 @@ inline llvm::StringRef getAsString(SyncScope S) { return "hip_wavefront"; case SyncScope::HIPWorkgroup: return "hip_workgroup"; + case SyncScope::HIPCluster: + return "hip_cluster"; case SyncScope::HIPAgent: return "hip_agent"; case SyncScope::HIPSystem: @@ -174,13 +180,18 @@ class AtomicScopeHIPModel : public AtomicScopeModel { /// The enum values match the pre-defined macros /// __HIP_MEMORY_SCOPE_*, which are used to define memory_scope_* /// enums in hip-c.h. + /// These may be present in pch files or bitcode so preserve existing values + /// when adding a new ID. enum ID { SingleThread = 1, Wavefront = 2, Workgroup = 3, Agent = 4, System = 5, - Last = System + Cluster = 6, + End, + Last = End - 1, + Count = Last }; AtomicScopeHIPModel() {} @@ -193,10 +204,14 @@ class AtomicScopeHIPModel : public AtomicScopeModel { return SyncScope::HIPWavefront; case Workgroup: return SyncScope::HIPWorkgroup; + case Cluster: + return SyncScope::HIPCluster; case Agent: return SyncScope::HIPAgent; case System: return SyncScope::HIPSystem; + case End: + break; } llvm_unreachable("Invalid language sync scope value"); } @@ -207,11 +222,12 @@ class AtomicScopeHIPModel : public AtomicScopeModel { } ArrayRef getRuntimeValues() const override { - static_assert(Last == System, "Does not include all sync scopes"); static const unsigned Scopes[] = { static_cast(SingleThread), static_cast(Wavefront), - static_cast(Workgroup), static_cast(Agent), - static_cast(System)}; + static_cast(Workgroup), static_cast(Cluster), + static_cast(System), static_cast(Agent)}; + static_assert(sizeof(Scopes) / sizeof(Scopes[0]) == Count, + "Does not include all sync scopes"); return llvm::ArrayRef(Scopes); } @@ -223,14 +239,18 @@ class AtomicScopeHIPModel : public AtomicScopeModel { /// Defines the generic atomic scope model. class AtomicScopeGenericModel : public AtomicScopeModel { public: - /// The enum values match predefined built-in macros __ATOMIC_SCOPE_*. + /// The enum values match predefined built-in macros __MEMORY_SCOPE_*. + /// These may be present in pch files or bitcode so preserve existing values + /// when adding a new ID. 
enum ID { System = 0, Device = 1, Workgroup = 2, Wavefront = 3, Single = 4, - Last = Single + Cluster = 5, + Count, + Last = Count - 1 }; AtomicScopeGenericModel() = default; @@ -243,10 +263,14 @@ class AtomicScopeGenericModel : public AtomicScopeModel { return SyncScope::SystemScope; case Workgroup: return SyncScope::WorkgroupScope; + case Cluster: + return SyncScope::ClusterScope; case Wavefront: return SyncScope::WavefrontScope; case Single: return SyncScope::SingleScope; + case Count: + break; } llvm_unreachable("Invalid language sync scope value"); } @@ -256,11 +280,12 @@ class AtomicScopeGenericModel : public AtomicScopeModel { } ArrayRef<unsigned> getRuntimeValues() const override { - static_assert(Last == Single, "Does not include all sync scopes"); static const unsigned Scopes[] = { - static_cast<unsigned>(Device), static_cast<unsigned>(System), - static_cast<unsigned>(Workgroup), static_cast<unsigned>(Wavefront), - static_cast<unsigned>(Single)}; + static_cast<unsigned>(System), static_cast<unsigned>(Device), + static_cast<unsigned>(Workgroup), static_cast<unsigned>(Cluster), + static_cast<unsigned>(Wavefront), static_cast<unsigned>(Single)}; + static_assert(sizeof(Scopes) / sizeof(Scopes[0]) == Count, + "Does not include all sync scopes"); return llvm::ArrayRef(Scopes); } diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index ceb16174e13e7..ea73ed915bf03 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1211,6 +1211,25 @@ class TargetInfo : public TransferrableTargetInfo, TiedOperand = N; // Don't copy Name or constraint string. } + + // For output operand constraints, the target can set bounds to indicate + // that the result value is guaranteed to fall within a certain range. + // This will cause corresponding assertions to be emitted that will allow + // for potential optimization based on that guarantee. + // + // NOTE: This reuses the `ImmRange` fields to store the range, which are + // otherwise unused for constraint types used for output operands. + void setOutputOperandBounds(unsigned Min, unsigned Max) { + ImmRange.Min = Min; + ImmRange.Max = Max; + ImmRange.isConstrained = true; + } + std::optional<std::pair<unsigned, unsigned>> + getOutputOperandBounds() const { + return ImmRange.isConstrained + ? std::make_pair(ImmRange.Min, ImmRange.Max) + : std::optional<std::pair<unsigned, unsigned>>(); + } }; /// Validate register name used for global register variables.
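To make the new hook concrete, a hypothetical target could advertise a bounded output constraint from its constraint-validation override roughly like this (a sketch only; `MyTargetInfo` and the 'k' constraint letter are invented for illustration):

```cpp
// Sketch: a target whose hypothetical 'k' output constraint always yields a
// value in [0, 255] records that range, so callers can emit the
// corresponding range assertions for the inline-asm result.
bool MyTargetInfo::validateAsmConstraint(
    const char *&Name, clang::TargetInfo::ConstraintInfo &Info) const {
  switch (*Name) {
  case 'k': // hypothetical bounded output constraint
    Info.setAllowsRegister();
    Info.setOutputOperandBounds(/*Min=*/0, /*Max=*/255);
    return true;
  default:
    return false;
  }
}
```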
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td index 412ef9abac1bc..2e5e1d93be096 100644 --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -831,9 +831,8 @@ multiclass contiguous_load; def: Intrinsic>:$addr, Predicate:$pred), - (IRIntBase<"masked_load", [Vector, CPtr]> - (CPtr $addr), !srl(memtype.size,3), - $pred, (zeroinit Vector))>, + (masked_load Vector, (CPtr $addr), + !srl(memtype.size,3), $pred, (zeroinit Vector))>, NameOverride; } @@ -846,9 +845,8 @@ multiclass contiguous_load; def: Intrinsic>:$addr, Predicate:$pred), - (IRIntBase<"masked_load", [Vector, CPtr]> - (CPtr $addr), !srl(memtype.size,3), - $pred, (zeroinit Vector))>, + (masked_load Vector, (CPtr $addr), + !srl(memtype.size,3), $pred, (zeroinit Vector))>, NameOverride<"vld1q_z">; } @@ -863,9 +861,7 @@ multiclass contiguous_load; def: Intrinsic>:$addr, Predicate:$pred), - (extend (IRIntBase<"masked_load", - [NarrowedVecOf, - CPtr>]> + (extend (masked_load NarrowedVecOf, (CPtr> $addr), !srl(memtype.size,3), $pred, (zeroinit NarrowedVecOf)), @@ -890,8 +886,7 @@ multiclass contiguous_store; def: Intrinsic>:$addr, Vector:$value, Predicate:$pred), - (IRIntBase<"masked_store", [Vector, Ptr]> - $value, (Ptr $addr), + (masked_store $value, (Ptr $addr), !srl(memtype.size,3), $pred)>, NameOverride; } @@ -907,8 +902,7 @@ multiclass contiguous_store; def: Intrinsic>:$addr, Vector:$value, Predicate:$pred), - (IRIntBase<"masked_store", [Vector, Ptr]> - $value, (Ptr $addr), + (masked_store $value, (Ptr $addr), !srl(memtype.size,3), $pred)>, NameOverride<"vst1q_p">; } @@ -925,9 +919,7 @@ multiclass contiguous_store; def: Intrinsic>:$addr, Vector:$value, Predicate:$pred), - (IRIntBase<"masked_store", - [NarrowedVecOf, - Ptr>]> + (masked_store (trunc $value, NarrowedVecOf), (Ptr> $addr), !srl(memtype.size,3), $pred)>, diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td index 083d03a396ba3..c1562a0c1f04c 100644 --- a/clang/include/clang/Basic/arm_mve_defs.td +++ b/clang/include/clang/Basic/arm_mve_defs.td @@ -134,6 +134,13 @@ def unzip: CGHelperFn<"VectorUnzip"> { } def zip: CGHelperFn<"VectorZip">; +def masked_load: IRBuilder<"CreateMaskedLoad"> { + let special_params = [IRBuilderIntParam<2, "Align">]; +} +def masked_store: IRBuilder<"CreateMaskedStore"> { + let special_params = [IRBuilderIntParam<2, "Align">]; +} + // Trivial 'codegen' function that just returns its argument. Useful // for wrapping up a variable name like $foo into a thing you can pass // around as type 'dag'. diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td index 07a8724b6f33d..96d8300a0faf3 100644 --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -1013,9 +1013,9 @@ let ManualCodegen = [{ }] in { let HasFRMRoundModeOp = true in { // 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions - defm vfadd : RVVFloatingBinBuiltinSetRoundingMode; - defm vfsub : RVVFloatingBinBuiltinSetRoundingMode; - defm vfrsub : RVVFloatingBinVFBuiltinSetRoundingMode; + defm vfadd : RVVFloatingBinBuiltinSetRoundingMode; + defm vfsub : RVVFloatingBinBuiltinSetRoundingMode; + defm vfrsub : RVVFloatingBinVFBuiltinSetRoundingMode; // 13.3. Vector Widening Floating-Point Add/Subtract Instructions // Widening FP add/subtract, 2*SEW = 2*SEW +/- SEW @@ -1023,14 +1023,14 @@ let ManualCodegen = [{ defm vfwsub : RVVFloatingWidenOp0BinBuiltinSetRoundingMode; // 13.4. 
Vector Single-Width Floating-Point Multiply/Divide Instructions - defm vfmul : RVVFloatingBinBuiltinSetRoundingMode; + defm vfmul : RVVFloatingBinBuiltinSetRoundingMode; defm vfdiv : RVVFloatingBinBuiltinSetRoundingMode; defm vfrdiv : RVVFloatingBinVFBuiltinSetRoundingMode; } // 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions - defm vfadd : RVVFloatingBinBuiltinSet; - defm vfsub : RVVFloatingBinBuiltinSet; - defm vfrsub : RVVFloatingBinVFBuiltinSet; + defm vfadd : RVVFloatingBinBuiltinSet; + defm vfsub : RVVFloatingBinBuiltinSet; + defm vfrsub : RVVFloatingBinVFBuiltinSet; // 13.3. Vector Widening Floating-Point Add/Subtract Instructions // Widening FP add/subtract, 2*SEW = 2*SEW +/- SEW @@ -1038,7 +1038,7 @@ let ManualCodegen = [{ defm vfwsub : RVVFloatingWidenOp0BinBuiltinSet; // 13.4. Vector Single-Width Floating-Point Multiply/Divide Instructions - defm vfmul : RVVFloatingBinBuiltinSet; + defm vfmul : RVVFloatingBinBuiltinSet; defm vfdiv : RVVFloatingBinBuiltinSet; defm vfrdiv : RVVFloatingBinVFBuiltinSet; } @@ -1065,6 +1065,10 @@ let ManualCodegen = [{ defm vfwmul : RVVOutOp0Op1BuiltinSet<"vfwmul", "x", [["vv", "w", "wvvu"], ["vf", "w", "wveu"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm vfwmul : RVVOutOp0Op1BuiltinSet<"vfwmul", "y", + [["vv", "vw", "wvvu"], + ["vf", "vw", "wveu"]]>; } } // 13.3. Vector Widening Floating-Point Add/Subtract Instructions @@ -1081,6 +1085,10 @@ let ManualCodegen = [{ defm vfwmul : RVVOutOp0Op1BuiltinSet<"vfwmul", "x", [["vv", "w", "wvv"], ["vf", "w", "wve"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm vfwmul : RVVOutOp0Op1BuiltinSet<"vfwmul", "y", + [["vv", "vw", "wvv"], + ["vf", "vw", "wve"]]>; } } } @@ -1170,6 +1178,8 @@ let ManualCodegen = [{ defm vfrec7 : RVVOutBuiltinSet<"vfrec7", "fd", [["v", "v", "vvu"]]>; let RequiredFeatures = ["zvfh"] in defm vfrec7 : RVVOutBuiltinSet<"vfrec7", "x", [["v", "v", "vvu"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm vfrec7 : RVVOutBuiltinSet<"vfrec7", "y", [["v", "v", "vvu"]]>; } // 13.8. Vector Floating-Point Square-Root Instruction defm vfsqrt : RVVOutBuiltinSet<"vfsqrt", "fd", [["v", "v", "vv"]]>; @@ -1180,21 +1190,26 @@ let ManualCodegen = [{ defm vfrec7 : RVVOutBuiltinSet<"vfrec7", "fd", [["v", "v", "vv"]]>; let RequiredFeatures = ["zvfh"] in defm vfrec7 : RVVOutBuiltinSet<"vfrec7", "x", [["v", "v", "vv"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm vfrec7 : RVVOutBuiltinSet<"vfrec7", "y", [["v", "v", "vv"]]>; } // 13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction defm vfrsqrt7 : RVVOutBuiltinSet<"vfrsqrt7", "fd", [["v", "v", "vv"]]>; let RequiredFeatures = ["zvfh"] in defm vfrsqrt7 : RVVOutBuiltinSet<"vfrsqrt7", "x", [["v", "v", "vv"]]>; +let RequiredFeatures = ["zvfbfa"] in + defm vfrsqrt7 : RVVOutBuiltinSet<"vfrsqrt7", "y", [["v", "v", "vv"]]>; + // 13.11. Vector Floating-Point MIN/MAX Instructions -defm vfmin : RVVFloatingBinBuiltinSet; -defm vfmax : RVVFloatingBinBuiltinSet; +defm vfmin : RVVFloatingBinBuiltinSet; +defm vfmax : RVVFloatingBinBuiltinSet; // 13.12. 
Vector Floating-Point Sign-Injection Instructions -defm vfsgnj : RVVFloatingBinBuiltinSet; -defm vfsgnjn : RVVFloatingBinBuiltinSet; -defm vfsgnjx : RVVFloatingBinBuiltinSet; +defm vfsgnj : RVVFloatingBinBuiltinSet; +defm vfsgnjn : RVVFloatingBinBuiltinSet; +defm vfsgnjx : RVVFloatingBinBuiltinSet; } defm vfneg_v : RVVPseudoVFUnaryBuiltin<"vfsgnjn", "fd">; let RequiredFeatures = ["zvfh"] in @@ -1219,6 +1234,8 @@ let UnMaskedPolicyScheme = HasPassthruOperand in { defm vfclass : RVVOp0BuiltinSet<"vfclass", "fd", [["v", "Uv", "Uvv"]]>; let RequiredFeatures = ["zvfh"] in defm vfclass : RVVOp0BuiltinSet<"vfclass", "x", [["v", "Uv", "Uvv"]]>; +let RequiredFeatures = ["zvfbfa"] in + defm vfclass : RVVOp0BuiltinSet<"vfclass", "y", [["v", "vUv", "Uvv"]]>; } // 13.15. Vector Floating-Point Merge Instruction @@ -1239,6 +1256,9 @@ let HasMasked = false, let RequiredFeatures = ["zvfh"] in defm vfmerge : RVVOutOp1BuiltinSet<"vfmerge", "x", [["vfm", "v", "vvem"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm vfmerge : RVVOutOp1BuiltinSet<"vfmerge", "y", + [["vfm", "v", "vvem"]]>; } // 13.16. Vector Floating-Point Move Instruction @@ -1252,6 +1272,9 @@ let HasMasked = false, let RequiredFeatures = ["zvfh"] in defm vfmv_v : RVVOutBuiltinSet<"vfmv_v_f", "x", [["f", "v", "ve"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm vfmv_v : RVVOutBuiltinSet<"vfmv_v_f", "y", + [["f", "v", "ve"]]>; } // 13.17. Single-Width Floating-Point/Integer Type-Convert Instructions @@ -1287,10 +1310,16 @@ let Log2LMUL = [-3, -2, -1, 0, 1, 2] in { defm : RVVConvBuiltinSet<"vfwcvt_f_x_v", "c", [["Fw", "Fwv"]]>; } } + let RequiredFeatures = ["zvfbfa"], OverloadedName = "vfwcvt_f_bf16" in { + defm : RVVConvBuiltinSet<"vfwcvt_f_xu_v", "c", [["Yw", "YwUv"]]>; + defm : RVVConvBuiltinSet<"vfwcvt_f_x_v", "c", [["Yw", "Ywv"]]>; + } let OverloadedName = "vfwcvt_f" in { defm : RVVConvBuiltinSet<"vfwcvt_f_f_v", "f", [["w", "wv"]]>; let RequiredFeatures = ["zvfhmin"] in defm : RVVConvBuiltinSet<"vfwcvt_f_f_v", "x", [["w", "wv"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm : RVVConvBuiltinSet<"vfwcvt_f_f_v", "y", [["vw", "wv"]]>; } } @@ -1300,17 +1329,23 @@ let Log2LMUL = [-3, -2, -1, 0, 1, 2] in { defm : RVVConvBuiltinSet<"vfncvt_rtz_xu_f_w", "si", [["Uv", "UvFw"]]>; let RequiredFeatures = ["zvfh"] in defm : RVVConvBuiltinSet<"vfncvt_rtz_xu_f_w", "c", [["Uv", "UvFw"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm : RVVConvBuiltinSet<"vfncvt_rtz_xu_f_w", "c", [["YwUv", "UvYw"]]>; } let OverloadedName = "vfncvt_rtz_x" in { defm : RVVConvBuiltinSet<"vfncvt_rtz_x_f_w", "si", [["Iv", "IvFw"]]>; let RequiredFeatures = ["zvfh"] in defm : RVVConvBuiltinSet<"vfncvt_rtz_x_f_w", "c", [["Iv", "IvFw"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm : RVVConvBuiltinSet<"vfncvt_rtz_x_f_w", "c", [["YwIv", "IvYw"]]>; } let OverloadedName = "vfncvt_rod_f" in { defm : RVVConvBuiltinSet<"vfncvt_rod_f_f_w", "f", [["v", "vw"]]>; let RequiredFeatures = ["zvfh"] in defm : RVVConvBuiltinSet<"vfncvt_rod_f_f_w", "x", [["v", "vw"]]>; } + let RequiredFeatures = ["zvfbfa"], OverloadedName = "vfncvt_rod_f_bf16" in + defm : RVVConvBuiltinSet<"vfncvt_rod_f_f_w", "y", [["v", "vw"]]>; } // Zvfbfmin - Vector convert BF16 to FP32 @@ -1363,11 +1398,15 @@ let ManualCodegen = [{ defm : RVVConvBuiltinSet<"vfncvt_x_f_w", "si", [["Iv", "IvFwu"]]>; let RequiredFeatures = ["zvfh"] in defm : RVVConvBuiltinSet<"vfncvt_x_f_w", "c", [["Iv", "IvFwu"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm : RVVConvBuiltinSet<"vfncvt_x_f_w", "c", [["YwIv", "IvYwu"]]>; } let OverloadedName 
= "vfncvt_xu" in { defm : RVVConvBuiltinSet<"vfncvt_xu_f_w", "si", [["Uv", "UvFwu"]]>; let RequiredFeatures = ["zvfh"] in defm : RVVConvBuiltinSet<"vfncvt_xu_f_w", "c", [["Uv", "UvFwu"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm : RVVConvBuiltinSet<"vfncvt_xu_f_w", "c", [["YwUv", "UvYwu"]]>; } let OverloadedName = "vfncvt_f" in { defm : RVVConvBuiltinSet<"vfncvt_f_x_w", "f", [["v", "vIwu"]]>; @@ -1382,6 +1421,8 @@ let ManualCodegen = [{ let RequiredFeatures = ["zvfhmin"] in defm : RVVConvBuiltinSet<"vfncvt_f_f_w", "x", [["v", "vwu"]]>; } + let RequiredFeatures = ["zvfbfa"], OverloadedName = "vfncvt_f_bf16" in + defm : RVVConvBuiltinSet<"vfncvt_f_f_w", "y", [["v", "vwu"]]>; } // Zvfbfmin - Vector convert FP32 to BF16 @@ -1430,11 +1471,15 @@ let ManualCodegen = [{ defm : RVVConvBuiltinSet<"vfncvt_x_f_w", "si", [["Iv", "IvFw"]]>; let RequiredFeatures = ["zvfh"] in defm : RVVConvBuiltinSet<"vfncvt_x_f_w", "c", [["Iv", "IvFw"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm : RVVConvBuiltinSet<"vfncvt_x_f_w", "c", [["YwIv", "IvYw"]]>; } let OverloadedName = "vfncvt_xu" in { defm : RVVConvBuiltinSet<"vfncvt_xu_f_w", "si", [["Uv", "UvFw"]]>; let RequiredFeatures = ["zvfh"] in defm : RVVConvBuiltinSet<"vfncvt_xu_f_w", "c", [["Uv", "UvFw"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm : RVVConvBuiltinSet<"vfncvt_xu_f_w", "c", [["YwUv", "UvYw"]]>; } let OverloadedName = "vfncvt_f" in { defm : RVVConvBuiltinSet<"vfncvt_f_x_w", "f", [["v", "vIw"]]>; @@ -1449,6 +1494,8 @@ let ManualCodegen = [{ let RequiredFeatures = ["zvfhmin"] in defm : RVVConvBuiltinSet<"vfncvt_f_f_w", "x", [["v", "vw"]]>; } + let RequiredFeatures = ["zvfbfa"], OverloadedName = "vfncvt_f_bf16" in + defm : RVVConvBuiltinSet<"vfncvt_f_f_w", "y", [["v", "vw"]]>; } // Zvfbfmin - Vector convert FP32 to BF16 @@ -1578,6 +1625,9 @@ let HasMasked = false, MaskedPolicyScheme = NonePolicy in { let RequiredFeatures = ["zvfh"] in defm vfmv_f : RVVOp0BuiltinSet<"vfmv_f_s", "x", [["s", "ve", "ev"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm vfmv_f : RVVOp0BuiltinSet<"vfmv_f_s", "y", + [["s", "ve", "ev"]]>; } let OverloadedName = "vfmv_s", UnMaskedPolicyScheme = HasPassthruOperand, @@ -1589,6 +1639,9 @@ let HasMasked = false, MaskedPolicyScheme = NonePolicy in { defm vfmv_s : RVVOutBuiltinSet<"vfmv_s_f", "x", [["f", "v", "ve"], ["x", "Uv", "UvUe"]]>; + let RequiredFeatures = ["zvfbfa"] in + defm vfmv_s : RVVOutBuiltinSet<"vfmv_s_f", "y", + [["f", "v", "ve"]]>; } } @@ -1601,11 +1654,11 @@ defm vslidedown : RVVSlideDownBuiltinSet; // 16.3.3. Vector Slide1up Instructions let UnMaskedPolicyScheme = HasPassthruOperand in { defm vslide1up : RVVSlideOneBuiltinSet; -defm vfslide1up : RVVFloatingBinVFBuiltinSet; +defm vfslide1up : RVVFloatingBinVFBuiltinSet; // 16.3.4. Vector Slide1down Instruction defm vslide1down : RVVSlideOneBuiltinSet; -defm vfslide1down : RVVFloatingBinVFBuiltinSet; +defm vfslide1down : RVVFloatingBinVFBuiltinSet; // 16.4. 
Vector Register Gather Instructions // signed and floating type diff --git a/clang/include/clang/Basic/riscv_vector_common.td b/clang/include/clang/Basic/riscv_vector_common.td index 767bcee7b1596..eaa2ba43885b0 100644 --- a/clang/include/clang/Basic/riscv_vector_common.td +++ b/clang/include/clang/Basic/riscv_vector_common.td @@ -83,6 +83,8 @@ // elements of the same width // F: given a vector type, compute the vector type with floating-point type // elements of the same width +// Y: given a vector type, compute the vector type with bfloat16 type elements +// of the same width // S: given a vector type, computes its equivalent one for LMUL=1. This is a // no-op if the vector was already LMUL=1 // (Log2EEW:Value): Log2EEW value could be 3/4/5/6 (8/16/32/64), given a @@ -470,6 +472,10 @@ let HasMaskedOffOperand = false in { defm "" : RVVOutOp1BuiltinSet; + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVOutOp1BuiltinSet; } multiclass RVVFloatingTerBuiltinSetRoundingMode { defm "" : RVVOutOp1BuiltinSet; + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVOutOp1BuiltinSet; } } @@ -491,6 +501,10 @@ let HasMaskedOffOperand = false, Log2LMUL = [-2, -1, 0, 1, 2] in { defm "" : RVVOutOp1Op2BuiltinSet; + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVOutOp1Op2BuiltinSet; } multiclass RVVFloatingWidenTerBuiltinSetRoundingMode { defm "" : RVVOutOp1Op2BuiltinSet; + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVOutOp1Op2BuiltinSet; } } -multiclass RVVFloatingBinBuiltinSet { +multiclass RVVFloatingBinBuiltinSet { defm "" : RVVOutOp1BuiltinSet; @@ -511,9 +529,15 @@ multiclass RVVFloatingBinBuiltinSet { defm "" : RVVOutOp1BuiltinSet; + if HasBF then { + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVOutOp1BuiltinSet; + } } -multiclass RVVFloatingBinBuiltinSetRoundingMode { +multiclass RVVFloatingBinBuiltinSetRoundingMode { defm "" : RVVOutOp1BuiltinSet; @@ -521,22 +545,38 @@ multiclass RVVFloatingBinBuiltinSetRoundingMode { defm "" : RVVOutOp1BuiltinSet; + if HasBF then { + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVOutOp1BuiltinSet; + } } -multiclass RVVFloatingBinVFBuiltinSet { +multiclass RVVFloatingBinVFBuiltinSet { defm "" : RVVOutOp1BuiltinSet; let RequiredFeatures = ["zvfh"] in defm "" : RVVOutOp1BuiltinSet; + if HasBF then { + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVOutOp1BuiltinSet; + } } -multiclass RVVFloatingBinVFBuiltinSetRoundingMode { +multiclass RVVFloatingBinVFBuiltinSetRoundingMode { defm "" : RVVOutOp1BuiltinSet; let RequiredFeatures = ["zvfh"] in defm "" : RVVOutOp1BuiltinSet; + if HasBF then { + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVOutOp1BuiltinSet; + } } multiclass RVVFloatingMaskOutBuiltinSet { @@ -547,6 +587,10 @@ multiclass RVVFloatingMaskOutBuiltinSet { defm "" : RVVOp0Op1BuiltinSet; + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVOp0Op1BuiltinSet; } multiclass RVVFloatingMaskOutVFBuiltinSet @@ -748,6 +792,10 @@ multiclass RVVFloatingWidenBinBuiltinSet { defm "" : RVVWidenBuiltinSet; + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVWidenBuiltinSet; } multiclass RVVFloatingWidenBinBuiltinSetRoundingMode { @@ -758,6 +806,10 @@ multiclass RVVFloatingWidenBinBuiltinSetRoundingMode { defm "" : RVVWidenBuiltinSet; + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVWidenBuiltinSet; } multiclass RVVFloatingWidenOp0BinBuiltinSet { @@ -768,6 +820,10 @@ multiclass RVVFloatingWidenOp0BinBuiltinSet { defm "" : RVVWidenWOp0BuiltinSet; + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVWidenWOp0BuiltinSet; } multiclass 
RVVFloatingWidenOp0BinBuiltinSetRoundingMode { @@ -778,4 +834,8 @@ multiclass RVVFloatingWidenOp0BinBuiltinSetRoundingMode { defm "" : RVVWidenWOp0BuiltinSet; + let RequiredFeatures = ["zvfbfa"] in + defm "" : RVVWidenWOp0BuiltinSet; } diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index b4c24d7e4a8aa..3288f5b12c77e 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -127,6 +127,14 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { cir::BoolType getBoolTy() { return cir::BoolType::get(getContext()); } cir::VoidType getVoidTy() { return cir::VoidType::get(getContext()); } + cir::IntType getUIntNTy(int n) { + return cir::IntType::get(getContext(), n, false); + } + + cir::IntType getSIntNTy(int n) { + return cir::IntType::get(getContext(), n, true); + } + cir::PointerType getPointerTo(mlir::Type ty) { return cir::PointerType::get(ty); } @@ -303,8 +311,9 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { } /// Create a copy with inferred length. - cir::CopyOp createCopy(mlir::Value dst, mlir::Value src) { - return cir::CopyOp::create(*this, dst.getLoc(), dst, src); + cir::CopyOp createCopy(mlir::Value dst, mlir::Value src, + bool isVolatile = false) { + return cir::CopyOp::create(*this, dst.getLoc(), dst, src, isVolatile); } cir::StoreOp createStore(mlir::Location loc, mlir::Value val, mlir::Value dst, diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrConstraints.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrConstraints.td index 8f72ff4d754ad..2548d464fb07f 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRAttrConstraints.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrConstraints.td @@ -38,14 +38,32 @@ def CIR_AnyIntOrFloatAttr : AnyAttrOf<[CIR_AnyIntAttr, CIR_AnyFPAttr], string cppType = "::mlir::TypedAttr"; } +//===----------------------------------------------------------------------===// +// Exceptions constraints +//===----------------------------------------------------------------------===// + +def CIR_AnyCatchAllAttr + : CIR_AttrConstraint<"::cir::CatchAllAttr", "catch all attribute">; + +def CIR_AnyUnwindAttr + : CIR_AttrConstraint<"::cir::UnwindAttr", "unwind attribute">; + //===----------------------------------------------------------------------===// // GlobalViewAttr constraints //===----------------------------------------------------------------------===// -def CIR_AnyGlobalViewAttr : CIR_AttrConstraint<"::cir::GlobalViewAttr", "GlobalView attribute">; +def CIR_AnyGlobalViewAttr + : CIR_AttrConstraint<"::cir::GlobalViewAttr", "GlobalView attribute">; -def CIR_AnyIntOrGlobalViewAttr : AnyAttrOf<[CIR_AnyIntAttr, CIR_AnyGlobalViewAttr], - "integer or global view attribute"> { +def CIR_AnyIntOrGlobalViewAttr + : AnyAttrOf<[CIR_AnyIntAttr, CIR_AnyGlobalViewAttr], + "integer or global view attribute"> { + string cppType = "::mlir::TypedAttr"; +} + +def CIR_TryHandlerAttr + : AnyAttrOf<[CIR_AnyGlobalViewAttr, CIR_AnyCatchAllAttr, CIR_AnyUnwindAttr], + "catch all or unwind or global view attribute"> { string cppType = "::mlir::TypedAttr"; } @@ -61,4 +79,7 @@ def CIR_IntOrGlobalViewArrayAttr : TypedArrayAttrBase; + +#endif // CLANG_CIR_DIALECT_IR_CIRATTRCONSTRAINTS_TD diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td index 610e349717e12..1e0fb038b19d8 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td +++ 
b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td @@ -967,5 +967,63 @@ def CIR_TypeInfoAttr : CIR_Attr<"TypeInfo", "typeinfo", [TypedAttrInterface]> { `<` custom($data) `>` }]; } +//===----------------------------------------------------------------------===// +// InlineAttr +//===----------------------------------------------------------------------===// + +def CIR_InlineKind : CIR_I32EnumAttr<"InlineKind", "inlineKind", [ + I32EnumAttrCase<"NoInline", 1, "never">, + I32EnumAttrCase<"AlwaysInline", 2, "always">, + I32EnumAttrCase<"InlineHint", 3, "hint"> +]> { + let genSpecializedAttr = 0; +} + +def CIR_InlineAttr : CIR_EnumAttr<CIR_InlineKind, "inline"> { + let summary = "Inline attribute"; + let description = [{ + Inline attribute represents user directives for inlining behavior. + This attribute is only used by `cir.func` operations. + + Values: + - `never`: Prevents the function from being inlined (__attribute__((noinline))) + - `always`: Forces the function to be inlined (__attribute__((always_inline))) + - `hint`: Suggests the function should be inlined (inline keyword) + + Example: + ``` + cir.func @noinline_func(%arg0: !s32i) -> !s32i inline(never) { + cir.return %arg0 : !s32i + } + cir.func @always_inline_func() -> !s32i inline(always) { + %0 = cir.const #cir.int<42> : !s32i + cir.return %0 : !s32i + } + ``` + }]; + + let cppClassName = "InlineAttr"; + + let extraClassDeclaration = [{ + bool isNoInline() const { return getValue() == InlineKind::NoInline; }; + bool isAlwaysInline() const { return getValue() == InlineKind::AlwaysInline; }; + bool isInlineHint() const { return getValue() == InlineKind::InlineHint; }; + }]; +} + +//===----------------------------------------------------------------------===// +// CatchAllAttr & UnwindAttr +//===----------------------------------------------------------------------===// + +// Represents the catch_all region. +def CIR_CatchAllAttr : CIR_UnitAttr<"CatchAll", "all"> { + let storageType = [{ CatchAllAttr }]; +} + +// Represents the unwind region where unwind continues or +// the program calls std::terminate. +def CIR_UnwindAttr : CIR_UnitAttr<"Unwind", "unwind"> { + let storageType = [{ CatchUnwind }]; +} #endif // CLANG_CIR_DIALECT_IR_CIRATTRS_TD diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 4c15d9ed0f834..86d09d72fe6ca 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -644,7 +644,7 @@ def CIR_StoreOp : CIR_Op<"store", [ defvar CIR_ReturnableScopes = [ "FuncOp", "ScopeOp", "IfOp", "SwitchOp", "CaseOp", - "DoWhileOp", "WhileOp", "ForOp" + "DoWhileOp", "WhileOp", "ForOp", "TryOp" ]; def CIR_ReturnOp : CIR_Op<"return", [ @@ -791,7 +791,7 @@ def CIR_ConditionOp : CIR_Op<"condition", [ defvar CIR_YieldableScopes = [ "ArrayCtor", "ArrayDtor", "CaseOp", "DoWhileOp", "ForOp", "GlobalOp", "IfOp", - "ScopeOp", "SwitchOp", "TernaryOp", "WhileOp" + "ScopeOp", "SwitchOp", "TernaryOp", "WhileOp", "TryOp" ]; def CIR_YieldOp : CIR_Op<"yield", [ @@ -2476,6 +2476,10 @@ def CIR_FuncOp : CIR_Op<"func", [ Similarly, for global destructors both `global_dtor` and `global_dtor()` are available. + The `inline(never)` keyword marks a function that should not be inlined. + The `inline(always)` keyword marks a function that should always be inlined. + The `inline(hint)` keyword suggests that the function should be inlined.
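(Editor's note: for readers of the `inline` kinds documented above, these are the C++ source forms that the description itself maps to each kind; the function names are invented for illustration.)

```cpp
// inline(never): __attribute__((noinline))
__attribute__((noinline)) int no_inline_func(int x) { return x; }

// inline(always): __attribute__((always_inline))
__attribute__((always_inline)) inline int always_inline_func() { return 42; }

// inline(hint): the plain `inline` keyword
inline int inline_hint_func() { return 1; }
```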
+ Example: ```mlir @@ -2510,6 +2514,7 @@ def CIR_FuncOp : CIR_Op<"func", [ UnitAttr:$dso_local, DefaultValuedAttr:$linkage, + OptionalAttr<CIR_InlineAttr>:$inline_kind, OptionalAttr:$sym_visibility, UnitAttr:$comdat, OptionalAttr:$arg_attrs, @@ -2719,6 +2724,8 @@ def CIR_CopyOp : CIR_Op<"copy",[ type of `src` and `dst` must match and both must implement the `DataLayoutTypeInterface`. + The `volatile` keyword indicates that the operation is volatile. + Examples: ```mlir @@ -2729,10 +2736,11 @@ def CIR_CopyOp : CIR_Op<"copy",[ let arguments = (ins Arg:$dst, - Arg:$src + Arg:$src, + UnitAttr:$is_volatile ); - let assemblyFormat = [{$src `to` $dst + let assemblyFormat = [{$src `to` $dst (`volatile` $is_volatile^)? attr-dict `:` qualified(type($dst)) }]; let hasVerifier = 1; @@ -4044,6 +4052,43 @@ def CIR_ExpectOp : CIR_Op<"expect", [ }]; } +//===----------------------------------------------------------------------===// +// PtrDiffOp +//===----------------------------------------------------------------------===// + +def CIR_PtrDiffOp : CIR_Op<"ptr_diff", [Pure, SameTypeOperands]> { + let summary = "Pointer subtraction arithmetic"; + let description = [{ + The cir.ptr_diff operation computes the difference between two pointers that + have the same element type. + + The result reflects the ABI-defined size of the pointed-to type. For example, + subtracting two !cir.ptr<!u64i> values may yield 1, representing an 8-byte + difference. In contrast, for pointers to void or function types, a result of + 8 corresponds to an 8-byte difference. + + For pointers to types whose size is not aligned with the target data + layout, the size is generally rounded to the next power of 2 bits. For + example, subtracting two !cir.ptr values for the _BitInt(24) type may + yield 1, representing a 4-byte difference (as opposed to a 3-byte + difference). + + Example: + + ```mlir + %7 = cir.ptr_diff %0, %1 : !cir.ptr<!u64i> -> !u64i + ``` + }]; + + let arguments = (ins CIR_PointerType:$lhs, CIR_PointerType:$rhs); + let results = (outs CIR_AnyFundamentalIntType:$result); + + let assemblyFormat = [{ + $lhs `,` $rhs `:` qualified(type($lhs)) `->` qualified(type($result)) + attr-dict + }]; +} + //===----------------------------------------------------------------------===// // Floating Point Ops //===----------------------------------------------------------------------===// @@ -4325,11 +4370,167 @@ def CIR_AllocExceptionOp : CIR_Op<"alloc.exception"> { }]; } +//===----------------------------------------------------------------------===// +// TryOp +//===----------------------------------------------------------------------===// + +def CIR_TryOp : CIR_Op<"try", [ + DeclareOpInterfaceMethods<RegionBranchOpInterface>, + RecursivelySpeculatable, AutomaticAllocationScope, NoRegionArguments +]> { + let summary = "C++ try block"; + let description = [{ + Holds the lexical scope of `try {}`. Note that resources used in catch + clauses are usually allocated in the same parent as `cir.try`. + + `synthetic`: use `cir.try` to represent try/catches not originally + present in the source code. For example, a synthetic `cir.try` region + is created around the constructor call when `operator new` is used + so that the memory allocated will be freed if the constructor throws + an exception. + + `cleanup`: indicates that there are cleanups that must be performed + when exiting the try region via exception, even if the exception is not + caught.
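(Editor's note: a short C++ illustration of the `synthetic` flag described above; the struct and function are invented, but the scenario — `operator new` followed by a possibly-throwing constructor — is the one the description cites.)

```cpp
// The storage returned by operator new must be released if S::S() throws,
// so CIRGen wraps the constructor call in a cir.try region that does not
// exist in the source; such a region is marked `synthetic`.
struct S {
  S(); // may throw
};

S *make() { return new S(); }
```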
+ + Example: + + ```mlir + cir.try { + cir.call exception @function() : () -> () + cir.yield + } catch [type #cir.global_view<@_ZTIPf> : !cir.ptr] { + ... + cir.yield + } unwind { + cir.resume + } + ``` + }]; + + let arguments = (ins + UnitAttr:$synthetic, + UnitAttr:$cleanup, + DefaultValuedAttr:$handler_types + ); + + let regions = (region + AnyRegion:$try_region, + VariadicRegion<AnyRegion>:$handler_regions + ); + + let assemblyFormat = [{ + (`synthetic` $synthetic^)? + (`cleanup` $cleanup^)? + $try_region + custom($handler_regions, $handler_types) + attr-dict + }]; + + let builders = [ + OpBuilder<(ins + "llvm::function_ref<void(mlir::OpBuilder &, mlir::Location)>":$tryBuilder, + "llvm::function_ref<void(mlir::OpBuilder &, mlir::Location, mlir::OperationState &)>":$handlersBuilder), + [{ + assert(tryBuilder && "expected builder callback for 'cir.try' body"); + assert(handlersBuilder + && "expected builder callback for 'handlers' body"); + + OpBuilder::InsertionGuard guard($_builder); + + // Try body region + mlir::Region *tryBodyRegion = $_state.addRegion(); + + // Create try body region and set insertion point + $_builder.createBlock(tryBodyRegion); + tryBuilder($_builder, $_state.location); + handlersBuilder($_builder, $_state.location, $_state); + }]> + ]; + + let hasLLVMLowering = false; +} + //===----------------------------------------------------------------------===// // Atomic operations //===----------------------------------------------------------------------===// -def CIR_AtomicXchg : CIR_Op<"atomic.xchg", [ +def CIR_AtomicFetchKind : CIR_I32EnumAttr< + "AtomicFetchKind", "Binary opcode for atomic fetch-and-update operations", [ + I32EnumAttrCase<"Add", 0, "add">, + I32EnumAttrCase<"Sub", 1, "sub">, + I32EnumAttrCase<"And", 2, "and">, + I32EnumAttrCase<"Xor", 3, "xor">, + I32EnumAttrCase<"Or", 4, "or">, + I32EnumAttrCase<"Nand", 5, "nand">, + I32EnumAttrCase<"Max", 6, "max">, + I32EnumAttrCase<"Min", 7, "min"> +]>; + +def CIR_AtomicFetchOp : CIR_Op<"atomic.fetch", [ + AllTypesMatch<["result", "val"]>, + TypesMatchWith<"type of 'val' must match the pointee type of 'ptr'", + "ptr", "val", "mlir::cast<cir::PointerType>($_self).getPointee()"> +]> { + let summary = "Atomic fetch-and-update operation"; + let description = [{ + C/C++ atomic fetch-and-update operation. This operation implements the C/C++ + builtin functions `__atomic_<binop>_fetch`, `__atomic_fetch_<binop>`, and + `__c11_atomic_fetch_<binop>`, where `<binop>` is one of the following binary + opcodes: `add`, `sub`, `and`, `xor`, `or`, `nand`, `max`, and `min`. + + This operation takes 2 arguments: a pointer `ptr` and a value `val`. The + type of `val` must match the pointee type of `ptr`. If the binary operation + is `add`, `sub`, `max`, or `min`, the type of `val` may either be an integer + type or a floating-point type. Otherwise, `val` must be an integer. + + This operation atomically loads the value from `ptr`, performs the binary + operation as indicated by `binop` on the loaded value and `val`, and stores + the result back to `ptr`. If the `fetch_first` flag is present, the result + of this operation is the old value loaded from `ptr` before the binary + operation. Otherwise, the result of this operation is the result of the + binary operation. + + Example: + %res = cir.atomic.fetch add seq_cst %ptr, %val + : (!cir.ptr<!s32i>, !s32i) -> !s32i + }]; + let results = (outs CIR_AnyIntOrFloatType:$result); + let arguments = (ins + Arg:$ptr, + CIR_AnyIntOrFloatType:$val, + CIR_AtomicFetchKind:$binop, + Arg:$mem_order, + UnitAttr:$is_volatile, + UnitAttr:$fetch_first + ); + + let assemblyFormat = [{ + $binop $mem_order + (`fetch_first` $fetch_first^)?
+ $ptr `,` $val + (`volatile` $is_volatile^)? + `:` `(` qualified(type($ptr)) `,` qualified(type($val)) `)` + `->` type($result) attr-dict + }]; + + let hasVerifier = 1; + + let extraLLVMLoweringPatternDecl = [{ + mlir::Value buildPostOp(cir::AtomicFetchOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter, + mlir::Value rmwVal, bool isInt) const; + + mlir::Value buildMinMaxPostOp(cir::AtomicFetchOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter, + mlir::Value rmwVal, bool isInt, + bool isSigned) const; + }]; +} + +def CIR_AtomicXchgOp : CIR_Op<"atomic.xchg", [ AllTypesMatch<["result", "val"]>, TypesMatchWith<"type of 'val' must match the pointee type of 'ptr'", "ptr", "val", "mlir::cast<cir::PointerType>($_self).getPointee()"> @@ -4347,9 +4548,7 @@ def CIR_AtomicXchg : CIR_Op<"atomic.xchg", [ Example: ```mlir - %res = cir.atomic.xchg(%ptr : !cir.ptr<!u64i>, - %val : !u64i, - seq_cst) : !u64i + %res = cir.atomic.xchg seq_cst %ptr, %val : !cir.ptr<!u64i> -> !u64i ``` }]; @@ -4364,12 +4563,16 @@ def CIR_AtomicXchg : CIR_Op<"atomic.xchg", [ let assemblyFormat = [{ $mem_order (`volatile` $is_volatile^)? $ptr `,` $val - `:` qualified(type($ptr)) `->` type($result) attr-dict + `:` functional-type(operands, results) attr-dict }]; } -def CIR_AtomicCmpXchg : CIR_Op<"atomic.cmpxchg", [ - AllTypesMatch<["old", "expected", "desired"]> +def CIR_AtomicCmpXchgOp : CIR_Op<"atomic.cmpxchg", [ + AllTypesMatch<["old", "expected", "desired"]>, + TypesMatchWith<"type of 'expected' must match the pointee type of 'ptr'", + "ptr", "expected", "mlir::cast<cir::PointerType>($_self).getPointee()">, + TypesMatchWith<"type of 'desired' must match the pointee type of 'ptr'", + "ptr", "desired", "mlir::cast<cir::PointerType>($_self).getPointee()"> ]> { let summary = "Atomic compare and exchange"; let description = [{ @@ -4402,12 +4605,9 @@ def CIR_AtomicCmpXchg : CIR_Op<"atomic.cmpxchg", [ Example: ```mlir - %old, %success = cir.atomic.cmpxchg(%ptr : !cir.ptr<!u64i>, - %expected : !u64i, - %desired : !u64i, - success = seq_cst, - failure = seq_cst) weak - : (!u64i, !cir.bool) + %old, %success = cir.atomic.cmpxchg weak success(seq_cst) failure(acquire) + %ptr, %expected, %desired + : (!cir.ptr<!u64i>, !u64i, !u64i) -> (!u64i, !cir.bool) ``` }]; let results = (outs CIR_AnyType:$old, CIR_BoolType:$success); @@ -4421,20 +4621,75 @@ def CIR_AtomicCmpXchg : CIR_Op<"atomic.cmpxchg", [ UnitAttr:$is_volatile); let assemblyFormat = [{ - `(` - $ptr `:` qualified(type($ptr)) `,` - $expected `:` type($expected) `,` - $desired `:` type($desired) `,` - `success` `=` $succ_order `,` - `failure` `=` $fail_order - `)` - (`align` `(` $alignment^ `)`)? (`weak` $weak^)? + `success` `(` $succ_order `)` `failure` `(` $fail_order `)` + $ptr `,` $expected `,` $desired + (`align` `(` $alignment^ `)`)? (`volatile` $is_volatile^)? - `:` `(` type($old) `,` type($success) `)` attr-dict + `:` functional-type(operands, results) attr-dict }]; +} - let hasVerifier = 1; +def CIR_AtomicTestAndSetOp : CIR_Op<"atomic.test_and_set"> { + let summary = "Atomic test and set"; + let description = [{ + C/C++ atomic test and set operation. Implements the builtin function + `__atomic_test_and_set`. + + The operation takes as its only operand a pointer to an 8-bit signed + integer. The operation atomically sets the integer to an implementation- + defined non-zero "set" value. The result of the operation is a boolean value + indicating whether the previous value of the integer was the "set" value.
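(Editor's note: `cir.atomic.test_and_set` and `cir.atomic.clear` model the `__atomic_test_and_set` and `__atomic_clear` builtins named in the descriptions; a minimal C++ spinlock sketch using those builtins, with invented names, follows.)

```cpp
// __atomic_test_and_set atomically stores the implementation-defined "set"
// value and returns whether the byte already held it; __atomic_clear resets
// the byte to zero. Together they form a simple spinlock.
static char lock_byte = 0;

void lock() {
  while (__atomic_test_and_set(&lock_byte, __ATOMIC_ACQUIRE))
    ; // spin while the previous value was already "set"
}

void unlock() { __atomic_clear(&lock_byte, __ATOMIC_RELEASE); }
```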
+ + Example: + ```mlir + %res = cir.atomic.test_and_set seq_cst %ptr : !cir.ptr<!s8i> -> !cir.bool + ``` + }]; + + let arguments = (ins + Arg, "", [MemRead, MemWrite]>:$ptr, + Arg:$mem_order, + OptionalAttr:$alignment, + UnitAttr:$is_volatile + ); + + let results = (outs CIR_BoolType:$result); + + let assemblyFormat = [{ + $mem_order $ptr + (`volatile` $is_volatile^)? + `:` qualified(type($ptr)) `->` qualified(type($result)) attr-dict + }]; +} + +def CIR_AtomicClearOp : CIR_Op<"atomic.clear"> { + let summary = "Atomic clear"; + let description = [{ + C/C++ atomic clear operation. Implements the builtin function + `__atomic_clear`. + + The operation takes as its only operand a pointer to an 8-bit signed + integer. The operation atomically sets the integer to zero. + + Example: + ```mlir + cir.atomic.clear seq_cst %ptr : !cir.ptr<!s8i> + ``` + }]; + + let arguments = (ins + Arg, "", [MemRead, MemWrite]>:$ptr, + Arg:$mem_order, + OptionalAttr:$alignment, + UnitAttr:$is_volatile + ); + + let assemblyFormat = [{ + $mem_order $ptr + (`volatile` $is_volatile^)? + `:` qualified(type($ptr)) attr-dict + }]; } #endif // CLANG_CIR_DIALECT_IR_CIROPS_TD diff --git a/clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h b/clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h index 17fddaee871b3..dbd030446a6fc 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h +++ b/clang/include/clang/CIR/Dialect/IR/CIROpsEnums.h @@ -54,10 +54,10 @@ static bool isLocalLinkage(GlobalLinkageKind linkage) { static bool isExternalWeakLinkage(GlobalLinkageKind linkage) { return linkage == GlobalLinkageKind::ExternalWeakLinkage; } -LLVM_ATTRIBUTE_UNUSED static bool isCommonLinkage(GlobalLinkageKind linkage) { +[[maybe_unused]] static bool isCommonLinkage(GlobalLinkageKind linkage) { return linkage == GlobalLinkageKind::CommonLinkage; } -LLVM_ATTRIBUTE_UNUSED static bool +[[maybe_unused]] static bool isValidDeclarationLinkage(GlobalLinkageKind linkage) { return isExternalWeakLinkage(linkage) || isExternalLinkage(linkage); } @@ -65,8 +65,7 @@ isValidDeclarationLinkage(GlobalLinkageKind linkage) { /// Whether the definition of this global may be replaced by something /// non-equivalent at link time. For example, if a function has weak linkage /// then the code defining it may be replaced by different code. -LLVM_ATTRIBUTE_UNUSED static bool -isInterposableLinkage(GlobalLinkageKind linkage) { +[[maybe_unused]] static bool isInterposableLinkage(GlobalLinkageKind linkage) { switch (linkage) { case GlobalLinkageKind::WeakAnyLinkage: case GlobalLinkageKind::LinkOnceAnyLinkage: @@ -89,8 +88,7 @@ isInterposableLinkage(GlobalLinkageKind linkage) { /// Whether the definition of this global may be discarded if it is not used /// in its compilation unit. -LLVM_ATTRIBUTE_UNUSED static bool -isDiscardableIfUnused(GlobalLinkageKind linkage) { +[[maybe_unused]] static bool isDiscardableIfUnused(GlobalLinkageKind linkage) { return isLinkOnceLinkage(linkage) || isLocalLinkage(linkage) || isAvailableExternallyLinkage(linkage); } @@ -99,7 +97,7 @@ isDiscardableIfUnused(GlobalLinkageKind linkage) { /// Using this method outside of the code generators is almost always a /// mistake: when working at the IR level use isInterposable instead as it /// knows about ODR semantics.
-LLVM_ATTRIBUTE_UNUSED static bool isWeakForLinker(GlobalLinkageKind linkage) { +[[maybe_unused]] static bool isWeakForLinker(GlobalLinkageKind linkage) { return linkage == GlobalLinkageKind::WeakAnyLinkage || linkage == GlobalLinkageKind::WeakODRLinkage || linkage == GlobalLinkageKind::LinkOnceAnyLinkage || @@ -108,7 +106,7 @@ LLVM_ATTRIBUTE_UNUSED static bool isWeakForLinker(GlobalLinkageKind linkage) { linkage == GlobalLinkageKind::ExternalWeakLinkage; } -LLVM_ATTRIBUTE_UNUSED static bool isValidLinkage(GlobalLinkageKind gl) { +[[maybe_unused]] static bool isValidLinkage(GlobalLinkageKind gl) { return isExternalLinkage(gl) || isLocalLinkage(gl) || isWeakLinkage(gl) || isLinkOnceLinkage(gl); } diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 4fbae150b587e..2374c8d1850ac 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -69,24 +69,31 @@ struct MissingFeatures { static bool opAllocaCaptureByInit() { return false; } // FuncOp handling - static bool opFuncOpenCLKernelMetadata() { return false; } + static bool opFuncArmNewAttr() { return false; } + static bool opFuncArmStreamingAttr() { return false; } static bool opFuncAstDeclAttr() { return false; } - static bool opFuncAttributesForDefinition() { return false; } static bool opFuncCallingConv() { return false; } + static bool opFuncColdHotAttr() { return false; } static bool opFuncCPUAndFeaturesAttributes() { return false; } static bool opFuncExceptions() { return false; } static bool opFuncExtraAttrs() { return false; } static bool opFuncMaybeHandleStaticInExternC() { return false; } + static bool opFuncMinSizeAttr() { return false; } static bool opFuncMultipleReturnVals() { return false; } + static bool opFuncNakedAttr() { return false; } + static bool opFuncNoDuplicateAttr() { return false; } static bool opFuncNoUnwind() { return false; } + static bool opFuncOpenCLKernelMetadata() { return false; } static bool opFuncOperandBundles() { return false; } + static bool opFuncOptNoneAttr() { return false; } static bool opFuncParameterAttributes() { return false; } static bool opFuncReadOnly() { return false; } static bool opFuncSection() { return false; } + static bool opFuncUnwindTablesAttr() { return false; } static bool opFuncWillReturn() { return false; } static bool opFuncNoReturn() { return false; } - static bool setLLVMFunctionFEnvAttributes() { return false; } static bool setFunctionAttributes() { return false; } + static bool setLLVMFunctionFEnvAttributes() { return false; } // CallOp handling static bool opCallAggregateArgs() { return false; } @@ -112,6 +119,7 @@ struct MissingFeatures { static bool opCallLandingPad() { return false; } static bool opCallContinueBlock() { return false; } static bool opCallChain() { return false; } + static bool opCallExceptionAttr() { return false; } // CXXNewExpr static bool exprNewNullCheck() { return false; } @@ -170,9 +178,10 @@ struct MissingFeatures { static bool atomicInfo() { return false; } static bool atomicInfoGetAtomicPointer() { return false; } static bool atomicInfoGetAtomicAddress() { return false; } - static bool atomicUseLibCall() { return false; } static bool atomicScope() { return false; } static bool atomicSyncScopeID() { return false; } + static bool atomicTypes() { return false; } + static bool atomicUseLibCall() { return false; } // Global ctor handling static bool globalCtorLexOrder() { return false; } @@ -271,6 +280,7 @@ struct MissingFeatures { static bool 
objCBlocks() { return false; } static bool objCGC() { return false; } static bool objCLifetime() { return false; } + static bool hlsl() { return false; } static bool openCL() { return false; } static bool openMP() { return false; } static bool opTBAA() { return false; } @@ -288,6 +298,7 @@ struct MissingFeatures { static bool sourceLanguageCases() { return false; } static bool stackBase() { return false; } static bool stackSaveOp() { return false; } + static bool stackProtector() { return false; } static bool targetCIRGenInfoArch() { return false; } static bool targetCIRGenInfoOS() { return false; } static bool targetCodeGenInfoGetNullPointer() { return false; } @@ -322,6 +333,7 @@ struct MissingFeatures { static bool invokeOp() { return false; } static bool labelOp() { return false; } static bool ptrDiffOp() { return false; } + static bool llvmLoweringPtrDiffConsidersPointee() { return false; } static bool ptrStrideOp() { return false; } static bool switchOp() { return false; } static bool throwOp() { return false; } diff --git a/clang/include/clang/Driver/Distro.h b/clang/include/clang/Driver/Distro.h index a515cbf91ccd6..0e17b30eb7e8d 100644 --- a/clang/include/clang/Driver/Distro.h +++ b/clang/include/clang/Driver/Distro.h @@ -30,9 +30,6 @@ class Distro { // the first and last known member in the family, e.g. IsRedHat(). AlpineLinux, ArchLinux, - DebianLenny, - DebianSqueeze, - DebianWheezy, DebianJessie, DebianStretch, DebianBuster, @@ -42,16 +39,13 @@ class Distro { DebianForky, DebianDuke, Exherbo, - RHEL5, - RHEL6, RHEL7, + RHEL8, + RHEL9, + RHEL10, Fedora, Gentoo, OpenSUSE, - UbuntuMaverick, - UbuntuNatty, - UbuntuOneiric, - UbuntuPrecise, UbuntuQuantal, UbuntuRaring, UbuntuSaucy, @@ -121,17 +115,17 @@ class Distro { /// @{ bool IsRedhat() const { - return DistroVal == Fedora || (DistroVal >= RHEL5 && DistroVal <= RHEL7); + return DistroVal == Fedora || (DistroVal >= RHEL7 && DistroVal <= RHEL10); } bool IsOpenSUSE() const { return DistroVal == OpenSUSE; } bool IsDebian() const { - return DistroVal >= DebianLenny && DistroVal <= DebianDuke; + return DistroVal >= DebianJessie && DistroVal <= DebianDuke; } bool IsUbuntu() const { - return DistroVal >= UbuntuMaverick && DistroVal <= UbuntuResolute; + return DistroVal >= UbuntuQuantal && DistroVal <= UbuntuResolute; } bool IsAlpineLinux() const { return DistroVal == AlpineLinux; } diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 86433044123a3..f6a434001152b 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6267,7 +6267,7 @@ def rewrite_legacy_objc : Flag<["-"], "rewrite-legacy-objc">, def rdynamic : Flag<["-"], "rdynamic">, Group, Visibility<[ClangOption, FlangOption]>; def resource_dir : Separate<["-"], "resource-dir">, - Flags<[NoXarchOption, HelpHidden]>, + Flags<[NoXarchOption]>, Visibility<[ClangOption, CC1Option, CLOption, DXCOption, FlangOption, FC1Option]>, HelpText<"The directory which holds the compiler resource files">, MarshallingInfoString>; @@ -6363,11 +6363,12 @@ def static : Flag<["-", "--"], "static">, Group, Flags<[NoArgumentUnused]>; def std_default_EQ : Joined<["-"], "std-default=">; def std_EQ : Joined<["-", "--"], "std=">, - Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, - Group, HelpText<"Language standard to compile for">, - ValuesCode<[{ + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, + Group, + HelpText<"Language standard to compile for">, + ValuesCode<[{ static constexpr const char 
VALUES_CODE[] = - #define LANGSTANDARD(id, name, lang, desc, features) name "," + #define LANGSTANDARD(id, name, lang, desc, features, version) name "," #define LANGSTANDARD_ALIAS(id, alias) alias "," #include "clang/Basic/LangStandards.def" ; @@ -9654,6 +9655,9 @@ def famd_opt : Flag<["-"], "famd-opt">, Group, HelpText<"Enable \"All\" [AMD] proprietary Optimizations">; def fno_amd_opt : Flag<["-"], "fno-amd-opt">, Group; } +def famd_allow_threadprivate_equivalence : Flag<["-"], "famd-allow-threadprivate-equivalence">, + Flags<[HelpHidden]>, Group, Visibility<[FlangOption, FC1Option]>, + HelpText<"Allow using variables in EQUIVALENCE statements with THREADPRIVATE">; let Visibility = [ClangOption, CLOption] in { def floop_unswitch_aggressive : Flag<["-"], "floop-unswitch-aggressive">, Group, HelpText<"Aggressively unswitch loops.">; @@ -9890,6 +9894,15 @@ def fhlsl_spv_use_unknown_image_format "from the resource's data type.">, MarshallingInfoFlag>; +def fhlsl_spv_enable_maximal_reconvergence + : Flag<["-"], "fspv-enable-maximal-reconvergence">, + Group, + Visibility<[CC1Option, DXCOption]>, + HelpText<"Enables the MaximallyReconvergesKHR execution mode for this " + "module. This ensures that control flow reconverges at " + "well-defined merge points as defined by the Vulkan spec.">, + MarshallingInfoFlag>; + def no_wasm_opt : Flag<["--"], "no-wasm-opt">, Group, HelpText<"Disable the wasm-opt optimizer">, diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 3df5b92654094..2852c4a2916a4 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -94,7 +94,7 @@ struct FormatStyle { /// /// \note /// This currently only applies to braced initializer lists (when - /// ``Cpp11BracedListStyle`` is ``true``) and parentheses. + /// ``Cpp11BracedListStyle`` is not ``Block``) and parentheses. /// \endnote BAS_BlockIndent, }; @@ -2555,29 +2555,67 @@ struct FormatStyle { /// \version 3.7 unsigned ContinuationIndentWidth; - /// If ``true``, format braced lists as best suited for C++11 braced - /// lists. - /// - /// Important differences: - /// - /// * No spaces inside the braced list. - /// * No line break before the closing brace. - /// * Indentation with the continuation indent, not with the block indent. - /// - /// Fundamentally, C++11 braced lists are formatted exactly like function - /// calls would be formatted in their place. If the braced list follows a name - /// (e.g. a type or variable name), clang-format formats as if the ``{}`` were - /// the parentheses of a function call with that name. If there is no name, - /// a zero-length name is assumed. - /// \code - /// true: false: - /// vector<int> x{1, 2, 3, 4}; vs. vector<int> x{ 1, 2, 3, 4 }; - /// vector<T> x{{}, {}, {}, {}}; vector<T> x{ {}, {}, {}, {} }; - /// f(MyMap[{composite, key}]); f(MyMap[{ composite, key }]); - /// new int[3]{1, 2, 3}; new int[3]{ 1, 2, 3 }; - /// \endcode + /// Different ways to handle braced lists. + enum BracedListStyle : int8_t { + /// Best suited for pre C++11 braced lists. + /// + /// * Spaces inside the braced list. + /// * Line break before the closing brace. + /// * Indentation with the block indent. + /// + /// \code + /// vector<int> x{ 1, 2, 3, 4 }; + /// vector<T> x{ {}, {}, {}, {} }; + /// f(MyMap[{ composite, key }]); + /// new int[3]{ 1, 2, 3 }; + /// Type name{ // Comment + /// value + /// }; + /// \endcode + BLS_Block, + /// Best suited for C++11 braced lists. + /// + /// * No spaces inside the braced list.
+ /// * No line break before the closing brace. + /// * Indentation with the continuation indent. + /// + /// Fundamentally, C++11 braced lists are formatted exactly like function + /// calls would be formatted in their place. If the braced list follows a + /// name (e.g. a type or variable name), clang-format formats as if the + /// ``{}`` were the parentheses of a function call with that name. If there + /// is no name, a zero-length name is assumed. + /// \code + /// vector<int> x{1, 2, 3, 4}; + /// vector<T> x{{}, {}, {}, {}}; + /// f(MyMap[{composite, key}]); + /// new int[3]{1, 2, 3}; + /// Type name{ // Comment + /// value}; + /// \endcode + BLS_FunctionCall, + /// Same as ``FunctionCall``, except for the handling of a comment at the + /// beginning: everything that follows is aligned with the comment. + /// + /// * No spaces inside the braced list. (Even for a comment at the first + /// position.) + /// * No line break before the closing brace. + /// * Indentation with the continuation indent, except when followed by a + /// line comment, then it uses the block indent. + /// + /// \code + /// vector<int> x{1, 2, 3, 4}; + /// vector<T> x{{}, {}, {}, {}}; + /// f(MyMap[{composite, key}]); + /// new int[3]{1, 2, 3}; + /// Type name{// Comment + /// value}; + /// \endcode + BLS_AlignFirstComment, + }; + + /// The style to handle braced lists. /// \version 3.4 - bool Cpp11BracedListStyle; + BracedListStyle Cpp11BracedListStyle; /// This option is **deprecated**. See ``DeriveLF`` and ``DeriveCRLF`` of /// ``LineEnding``. @@ -4933,7 +4971,7 @@ struct FormatStyle { /// Specifies when to insert a space in empty braces. /// \note /// This option doesn't apply to initializer braces if - /// ``Cpp11BracedListStyle`` is set to ``true``. + /// ``Cpp11BracedListStyle`` is not ``Block``. /// \endnote /// \version 22 SpaceInEmptyBracesStyle SpaceInEmptyBraces; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 37598f8530c09..87b96c2d5ad09 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -3961,6 +3961,13 @@ class Sema final : public SemaBase { bool &AddToScope, ArrayRef<BindingDecl *> Bindings = {}); +private: + // Perform a check on an AsmLabel to verify its consistency and emit + // diagnostics in case of an error. + void CheckAsmLabel(Scope *S, Expr *AsmLabelExpr, StorageClass SC, + TypeSourceInfo *TInfo, VarDecl *); + +public: /// Perform semantic checking on a newly-created variable /// declaration. /// @@ -5010,6 +5017,14 @@ class Sema final : public SemaBase { void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks); + /// Add a cluster_dims attribute to a particular declaration. + CUDAClusterDimsAttr *createClusterDimsAttr(const AttributeCommonInfo &CI, + Expr *X, Expr *Y, Expr *Z); + void addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X, + Expr *Y, Expr *Z); + /// Add a no_cluster attribute to a particular declaration.
+ void addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI); + enum class RetainOwnershipKind { NS, CF, OS }; UuidAttr *mergeUuidAttr(Decl *D, const AttributeCommonInfo &CI, @@ -10006,7 +10021,7 @@ class Sema final : public SemaBase { public: DeferDiagsRAII(Sema &S, bool DeferDiags) : S(S), SavedDeferDiags(S.DeferDiags) { - S.DeferDiags = DeferDiags; + S.DeferDiags = SavedDeferDiags || DeferDiags; } ~DeferDiagsRAII() { S.DeferDiags = SavedDeferDiags; } }; @@ -12992,6 +13007,37 @@ class Sema final : public SemaBase { /// default arguments of its methods have been parsed. UnparsedDefaultArgInstantiationsMap UnparsedDefaultArgInstantiations; + + using InstantiatingSpecializationsKey = llvm::PointerIntPair<Decl *, 2>; + + struct RecursiveInstGuard { + enum class Kind { + Template, + DefaultArgument, + ExceptionSpec, + }; + + RecursiveInstGuard(Sema &S, Decl *D, Kind Kind) + : S(S), Key(D->getCanonicalDecl(), unsigned(Kind)) { + auto [_, Created] = S.InstantiatingSpecializations.insert(Key); + if (!Created) + Key = {}; + } + + ~RecursiveInstGuard() { + if (Key.getOpaqueValue()) { + [[maybe_unused]] bool Erased = + S.InstantiatingSpecializations.erase(Key); + assert(Erased); + } + } + + operator bool() const { return Key.getOpaqueValue() == nullptr; } + + private: + Sema &S; + Sema::InstantiatingSpecializationsKey Key; + }; + /// A context in which code is being synthesized (where a source location /// alone is not sufficient to identify the context). This covers template /// instantiation and various forms of implicitly-generated functions. @@ -13353,14 +13399,9 @@ class Sema final : public SemaBase { /// recursive template instantiations. bool isInvalid() const { return Invalid; } - /// Determine whether we are already instantiating this - /// specialization in some surrounding active instantiation. - bool isAlreadyInstantiating() const { return AlreadyInstantiating; } - private: Sema &SemaRef; bool Invalid; - bool AlreadyInstantiating; InstantiatingTemplate(Sema &SemaRef, CodeSynthesisContext::SynthesisKind Kind, @@ -13385,6 +13426,13 @@ class Sema final : public SemaBase { const MultiLevelTemplateArgumentList &TemplateArgs, TemplateArgumentListInfo &Outputs); + /// Substitute concept template arguments in the constraint expression + /// of a concept-id. This is used to implement [temp.constr.normal]. + ExprResult + SubstConceptTemplateArguments(const ConceptSpecializationExpr *CSE, + const Expr *ConstraintExpr, + const MultiLevelTemplateArgumentList &MLTAL); + bool SubstTemplateArgumentsInParameterMapping( ArrayRef Args, SourceLocation BaseLoc, const MultiLevelTemplateArgumentList &TemplateArgs, @@ -13483,7 +13531,7 @@ class Sema final : public SemaBase { SmallVector CodeSynthesisContexts; /// Specializations whose definitions are currently being instantiated. - llvm::DenseSet<std::pair<Decl *, unsigned>> InstantiatingSpecializations; + llvm::DenseSet<InstantiatingSpecializationsKey> InstantiatingSpecializations; /// Non-dependent types used in templates that have already been instantiated /// by some template instantiation. @@ -13758,6 +13806,14 @@ class Sema final : public SemaBase { const MultiLevelTemplateArgumentList &TemplateArgs, TemplateSpecializationKind TSK, bool Complain = true); +private: + bool InstantiateClassImpl(SourceLocation PointOfInstantiation, + CXXRecordDecl *Instantiation, + CXXRecordDecl *Pattern, + const MultiLevelTemplateArgumentList &TemplateArgs, + TemplateSpecializationKind TSK, bool Complain); + +public: /// Instantiate the definition of an enum from a given pattern.
/// /// \param PointOfInstantiation The point of instantiation within the diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h index 761070c4996bc..41e950ccc0800 100644 --- a/clang/include/clang/Sema/SemaAMDGPU.h +++ b/clang/include/clang/Sema/SemaAMDGPU.h @@ -32,6 +32,8 @@ class SemaAMDGPU : public SemaBase { bool checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore); + bool checkScopedMemAccessFunctionCall(CallExpr *TheCall); + bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs, unsigned NumDataArgs); diff --git a/clang/include/clang/Sema/SemaBase.h b/clang/include/clang/Sema/SemaBase.h index 550f530af72f5..8e43b0b1ae7ff 100644 --- a/clang/include/clang/Sema/SemaBase.h +++ b/clang/include/clang/Sema/SemaBase.h @@ -212,16 +212,13 @@ class SemaBase { }; /// Emit a diagnostic. - SemaDiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID, - bool DeferHint = false); + SemaDiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID); /// Emit a partial diagnostic. - SemaDiagnosticBuilder Diag(SourceLocation Loc, const PartialDiagnostic &PD, - bool DeferHint = false); + SemaDiagnosticBuilder Diag(SourceLocation Loc, const PartialDiagnostic &PD); /// Emit a compatibility diagnostic. - SemaDiagnosticBuilder DiagCompat(SourceLocation Loc, unsigned CompatDiagId, - bool DeferHint = false); + SemaDiagnosticBuilder DiagCompat(SourceLocation Loc, unsigned CompatDiagId); /// Build a partial diagnostic. PartialDiagnostic PDiag(unsigned DiagID = 0); diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 60c7d275f1aaf..e963439b05c98 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -205,8 +205,8 @@ enum class TemplateSubstitutionKind : char { /// Add a new outmost level to the multi-level template argument /// list. - /// A 'Final' substitution means that Subst* nodes won't be built - /// for the replacements. + /// A 'Final' substitution means that these Args don't need to be + /// resugared later. 
void addOuterTemplateArguments(Decl *AssociatedDecl, ArgList Args, bool Final) { assert(!NumRetainedOuterLevels && diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td index 4473c54d8d6e3..ffae3b9310979 100644 --- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td +++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -76,9 +76,6 @@ def SecurityAlpha : Package<"security">, ParentPackage; def CERT : Package<"cert">, ParentPackage; def ENV : Package<"env">, ParentPackage; -def CERTAlpha : Package<"cert">, ParentPackage; -def POSAlpha : Package<"pos">, ParentPackage; - def Unix : Package<"unix">; def UnixAlpha : Package<"unix">, ParentPackage; def CString : Package<"cstring">, ParentPackage; @@ -195,6 +192,11 @@ def NullDereferenceChecker HelpText<"Check for dereferences of null pointers">, Documentation; +def NullPointerArithmChecker + : Checker<"NullPointerArithm">, + HelpText<"Check for undefined arithmetic operations on null pointers">, + Documentation; + def NonNullParamChecker : Checker<"NonNullParamChecker">, HelpText<"Check for null pointers passed as arguments to a function whose " "arguments are references or marked with the 'nonnull' attribute">, diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h index e3cf1bac83ad0..1e87b479d1cf8 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h @@ -140,8 +140,8 @@ class AnalysisManager : public BugReporterData { // It might be great to reuse FrontendOptions::getInputKindForExtension() // but for now it doesn't discriminate between code and header files. return llvm::StringSwitch<bool>(SM.getFilename(SL).rsplit('.').second) - .Cases("c", "m", "mm", "C", "cc", "cp", true) - .Cases("cpp", "CPP", "c++", "cxx", "cppm", true) + .Cases({"c", "m", "mm", "C", "cc", "cp"}, true) + .Cases({"cpp", "CPP", "c++", "cxx", "cppm"}, true) .Default(false); } diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h index c233ca1af0256..4aee16576ebd1 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h @@ -211,6 +211,16 @@ class CallEvent { getExtraInvalidatedValues(ValueList &Values, RegionAndSymbolInvalidationTraits *ETraits) const {} + /// A state for looking up relevant Environment entries (arguments, return + /// value), dynamic type information and similar "stable" things. + /// WARNING: During the evaluation of a function call, several state + /// transitions happen, so this state can become partially obsolete! + /// + /// TODO: Instead of storing a complete state object in the CallEvent, only + /// store the relevant parts (such as argument/return SVals etc.) that aren't + /// allowed to become obsolete until the end of the call evaluation. + ProgramStateRef getState() const { return State; } + public: CallEvent &operator=(const CallEvent &) = delete; virtual ~CallEvent() = default; @@ -231,8 +241,11 @@ class CallEvent { } void setForeign(bool B) const { Foreign = B; } - /// The state in which the call is being evaluated. - const ProgramStateRef &getState() const { return State; } + /// NOTE: There are plans for refactoring that would eliminate this method.
+ /// Prefer to use CheckerContext::getASTContext if possible! + const ASTContext &getASTContext() const { + return getState()->getStateManager().getContext(); + } /// The context in which the call is being evaluated. const LocationContext *getLocationContext() const { return LCtx; } diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/FunctionSummary.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/FunctionSummary.h index 761395260a0cf..db4aec7c84754 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/FunctionSummary.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/FunctionSummary.h @@ -48,6 +48,9 @@ class FunctionSummariesTy { /// The number of times the function has been inlined. unsigned TimesInlined : 32; + /// Running time for syntax-based AST analysis in milliseconds. + std::optional SyntaxRunningTime = std::nullopt; + FunctionSummary() : TotalBasicBlocks(0), InlineChecked(0), MayInline(0), TimesInlined(0) {} @@ -69,6 +72,11 @@ class FunctionSummariesTy { return I; } + FunctionSummary const *findSummary(const Decl *D) const { + auto I = Map.find(D); + return I == Map.end() ? nullptr : &I->second; + } + void markMayInline(const Decl *D) { MapTy::iterator I = findOrInsertSummary(D); I->second.InlineChecked = 1; diff --git a/clang/lib/AST/ASTConcept.cpp b/clang/lib/AST/ASTConcept.cpp index fd12bc4e83827..9ea104c4c3c9d 100644 --- a/clang/lib/AST/ASTConcept.cpp +++ b/clang/lib/AST/ASTConcept.cpp @@ -86,7 +86,7 @@ void ConstraintSatisfaction::Profile(llvm::FoldingSetNodeID &ID, ID.AddPointer(ConstraintOwner); ID.AddInteger(TemplateArgs.size()); for (auto &Arg : TemplateArgs) - Arg.Profile(ID, C); + C.getCanonicalTemplateArgument(Arg).Profile(ID, C); } ConceptReference * diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 373c5f1310692..a98574445bf33 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -563,8 +563,7 @@ comments::FullComment *ASTContext::getCommentForDecl( // does not have one of its own. 
QualType QT = TD->getUnderlyingType(); if (const auto *TT = QT->getAs()) - if (comments::FullComment *FC = - getCommentForDecl(TT->getOriginalDecl(), PP)) + if (comments::FullComment *FC = getCommentForDecl(TT->getDecl(), PP)) return cloneFullComment(FC, D); } else if (const auto *IC = dyn_cast(D)) { @@ -2392,7 +2391,7 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { case Type::Record: case Type::Enum: { const auto *TT = cast(T); - const TagDecl *TD = TT->getOriginalDecl()->getDefinitionOrSelf(); + const TagDecl *TD = TT->getDecl()->getDefinitionOrSelf(); if (TD->isInvalidDecl()) { Width = 8; @@ -2536,7 +2535,7 @@ unsigned ASTContext::getTypeUnadjustedAlign(const Type *T) const { unsigned UnadjustedAlign; if (const auto *RT = T->getAsCanonical()) { - const ASTRecordLayout &Layout = getASTRecordLayout(RT->getOriginalDecl()); + const ASTRecordLayout &Layout = getASTRecordLayout(RT->getDecl()); UnadjustedAlign = toBits(Layout.getUnadjustedAlignment()); } else if (const auto *ObjCI = T->getAsCanonical()) { const ASTRecordLayout &Layout = getASTObjCInterfaceLayout(ObjCI->getDecl()); @@ -3474,7 +3473,7 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx, llvm_unreachable("should never get here"); } case Type::Record: { - const RecordDecl *RD = T->castAsCanonical()->getOriginalDecl(); + const RecordDecl *RD = T->castAsCanonical()->getDecl(); const IdentifierInfo *II = RD->getIdentifier(); // In C++, an immediate typedef of an anonymous struct or union @@ -4718,8 +4717,8 @@ QualType ASTContext::getConstantMatrixType(QualType ElementTy, unsigned NumRows, assert(MatrixType::isValidElementType(ElementTy) && "need a valid element type"); - assert(ConstantMatrixType::isDimensionValid(NumRows) && - ConstantMatrixType::isDimensionValid(NumColumns) && + assert(NumRows > 0 && NumRows <= LangOpts.MaxMatrixDimension && + NumColumns > 0 && NumColumns <= LangOpts.MaxMatrixDimension && "need valid matrix dimensions"); void *InsertPos = nullptr; if (ConstantMatrixType *MTP = MatrixTypes.FindNodeOrInsertPos(ID, InsertPos)) @@ -5382,7 +5381,7 @@ TagType *ASTContext::getTagTypeInternal(ElaboratedTypeKeyword Keyword, }(); assert(T->getKeyword() == Keyword); assert(T->getQualifier() == Qualifier); - assert(T->getOriginalDecl() == TD); + assert(T->getDecl() == TD); assert(T->isInjected() == IsInjected); assert(T->isTagOwned() == OwnsTag); assert((T->isCanonicalUnqualified() @@ -8276,7 +8275,7 @@ Qualifiers::ObjCLifetime ASTContext::getInnerObjCOwnership(QualType T) const { static const Type *getIntegerTypeForEnum(const EnumType *ET) { // Incomplete enum types are not treated as integer types. // FIXME: In C++, enum types are never integer types. 
- const EnumDecl *ED = ET->getOriginalDecl()->getDefinitionOrSelf(); + const EnumDecl *ED = ET->getDecl()->getDefinitionOrSelf(); if (ED->isComplete() && !ED->isScoped()) return ED->getIntegerType().getTypePtr(); return nullptr; @@ -9194,7 +9193,7 @@ static void EncodeBitField(const ASTContext *Ctx, std::string& S, S += llvm::utostr(Offset); if (const auto *ET = T->getAsCanonical()) - S += ObjCEncodingForEnumDecl(Ctx, ET->getOriginalDecl()); + S += ObjCEncodingForEnumDecl(Ctx, ET->getDecl()); else { const auto *BT = T->castAs(); S += getObjCEncodingForPrimitiveType(Ctx, BT); @@ -9251,7 +9250,7 @@ void ASTContext::getObjCEncodingForTypeImpl(QualType T, std::string &S, if (const auto *BT = dyn_cast(CT)) S += getObjCEncodingForPrimitiveType(this, BT); else - S += ObjCEncodingForEnumDecl(this, cast(CT)->getOriginalDecl()); + S += ObjCEncodingForEnumDecl(this, cast(CT)->getDecl()); return; case Type::Complex: @@ -9319,7 +9318,7 @@ void ASTContext::getObjCEncodingForTypeImpl(QualType T, std::string &S, return; } } else if (const auto *RTy = PointeeTy->getAsCanonical()) { - const IdentifierInfo *II = RTy->getOriginalDecl()->getIdentifier(); + const IdentifierInfo *II = RTy->getDecl()->getIdentifier(); // GCC binary compat: Need to convert "struct objc_class *" to "#". if (II == &Idents.get("objc_class")) { S += '#'; @@ -9391,7 +9390,7 @@ void ASTContext::getObjCEncodingForTypeImpl(QualType T, std::string &S, return; case Type::Record: { - RecordDecl *RDecl = cast(CT)->getOriginalDecl(); + RecordDecl *RDecl = cast(CT)->getDecl(); S += RDecl->isUnion() ? '(' : '{'; // Anonymous structures print as '?' if (const IdentifierInfo *II = RDecl->getIdentifier()) { @@ -11295,7 +11294,7 @@ QualType ASTContext::mergeTransparentUnionType(QualType T, QualType SubType, bool OfBlockPointer, bool Unqualified) { if (const RecordType *UT = T->getAsUnionType()) { - RecordDecl *UD = UT->getOriginalDecl()->getMostRecentDecl(); + RecordDecl *UD = UT->getDecl()->getMostRecentDecl(); if (UD->hasAttr()) { for (const auto *I : UD->fields()) { QualType ET = I->getType().getUnqualifiedType(); @@ -11565,7 +11564,7 @@ static QualType mergeEnumWithInteger(ASTContext &Context, const EnumType *ET, // Compatibility is based on the underlying type, not the promotion // type. QualType underlyingType = - ET->getOriginalDecl()->getDefinitionOrSelf()->getIntegerType(); + ET->getDecl()->getDefinitionOrSelf()->getIntegerType(); if (underlyingType.isNull()) return {}; if (Context.hasSameType(underlyingType, other)) @@ -11586,6 +11585,12 @@ QualType ASTContext::mergeTagDefinitions(QualType LHS, QualType RHS) { if (LangOpts.CPlusPlus || !LangOpts.C23) return {}; + // Nameless tags are comparable only within outer definitions. At the top + // level they are not comparable. + const TagDecl *LTagD = LHS->castAsTagDecl(), *RTagD = RHS->castAsTagDecl(); + if (!LTagD->getIdentifier() || !RTagD->getIdentifier()) + return {}; + // C23, on the other hand, requires the members to be "the same enough", so // we use a structural equivalence check. 
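  // (Illustrative example, not from the patch: under this rule, `struct S
  // { int a; }` defined in two translation units is compatible, whereas two
  // file-scope `struct { int a; }` definitions are distinct types, since
  // nameless tags only compare equal inside an outer definition.)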
StructuralEquivalenceContext::NonEquivalentDeclSet NonEquivalentDecls; @@ -14163,11 +14168,10 @@ static QualType getCommonNonSugarTypeNode(const ASTContext &Ctx, const Type *X, case Type::Record: case Type::InjectedClassName: { const auto *TX = cast(X), *TY = cast(Y); - return Ctx.getTagType( - ::getCommonTypeKeyword(TX, TY, /*IsSame=*/false), - ::getCommonQualifier(Ctx, TX, TY, /*IsSame=*/false), - ::getCommonDeclChecked(TX->getOriginalDecl(), TY->getOriginalDecl()), - /*OwnedTag=*/false); + return Ctx.getTagType(::getCommonTypeKeyword(TX, TY, /*IsSame=*/false), + ::getCommonQualifier(Ctx, TX, TY, /*IsSame=*/false), + ::getCommonDeclChecked(TX->getDecl(), TY->getDecl()), + /*OwnedTag=*/false); } case Type::TemplateSpecialization: { const auto *TX = cast(X), diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp index d7fd411ab464c..b8023cb6fa10f 100644 --- a/clang/lib/AST/ASTDiagnostic.cpp +++ b/clang/lib/AST/ASTDiagnostic.cpp @@ -196,8 +196,7 @@ break; \ // Don't desugar through the primary typedef of an anonymous type. if (const TagType *UTT = Underlying->getAs()) if (const TypedefType *QTT = dyn_cast(QT)) - if (UTT->getOriginalDecl()->getTypedefNameForAnonDecl() == - QTT->getDecl()) + if (UTT->getDecl()->getTypedefNameForAnonDecl() == QTT->getDecl()) break; // Record that we actually looked through an opaque type here. @@ -1147,14 +1146,11 @@ class TemplateDiff { if (const auto* SubstType = Ty->getAs()) Ty = SubstType->getReplacementType(); - const RecordType *RT = Ty->getAs(); - + const auto *RT = Ty->getAs(); if (!RT) return nullptr; - const ClassTemplateSpecializationDecl *CTSD = - dyn_cast(RT->getOriginalDecl()); - + const auto *CTSD = dyn_cast(RT->getDecl()); if (!CTSD) return nullptr; diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index f43fa8c90ad3b..bf51c3e42719c 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1740,7 +1740,7 @@ ExpectedType ASTNodeImporter::VisitDeducedTemplateSpecializationType( } ExpectedType ASTNodeImporter::VisitTagType(const TagType *T) { - TagDecl *DeclForType = T->getOriginalDecl(); + TagDecl *DeclForType = T->getDecl(); Expected ToDeclOrErr = import(DeclForType); if (!ToDeclOrErr) return ToDeclOrErr.takeError(); @@ -2155,7 +2155,7 @@ Error ASTNodeImporter::ImportDeclParts( const Type *LeafT = getLeafPointeeType(P->getType().getCanonicalType().getTypePtr()); auto *RT = dyn_cast(LeafT); - if (RT && RT->getOriginalDecl() == D) { + if (RT && RT->getDecl() == D) { Importer.FromDiag(D->getLocation(), diag::err_unsupported_ast_node) << D->getDeclKindName(); return make_error(ASTImportError::UnsupportedConstruct); @@ -2408,8 +2408,8 @@ Error ASTNodeImporter::ImportFieldDeclDefinition(const FieldDecl *From, const RecordType *RecordTo = ToType->getAs(); if (RecordFrom && RecordTo) { - FromRecordDecl = RecordFrom->getOriginalDecl(); - ToRecordDecl = RecordTo->getOriginalDecl(); + FromRecordDecl = RecordFrom->getDecl(); + ToRecordDecl = RecordTo->getDecl(); } } @@ -3205,7 +3205,7 @@ ExpectedDecl ASTNodeImporter::VisitEnumDecl(EnumDecl *D) { if (auto *Typedef = dyn_cast(FoundDecl)) { if (const auto *Tag = Typedef->getUnderlyingType()->getAs()) - FoundDecl = Tag->getOriginalDecl(); + FoundDecl = Tag->getDecl(); } if (auto *FoundEnum = dyn_cast(FoundDecl)) { @@ -3336,7 +3336,7 @@ ExpectedDecl ASTNodeImporter::VisitRecordDecl(RecordDecl *D) { Decl *Found = FoundDecl; if (auto *Typedef = dyn_cast(Found)) { if (const auto *Tag = Typedef->getUnderlyingType()->getAs()) - Found = 
Tag->getOriginalDecl(); + Found = Tag->getDecl(); } if (auto *FoundRecord = dyn_cast(Found)) { @@ -3757,12 +3757,11 @@ class IsTypeDeclaredInsideVisitor } std::optional VisitTagType(const TagType *T) { - if (auto *Spec = - dyn_cast(T->getOriginalDecl())) + if (auto *Spec = dyn_cast(T->getDecl())) for (const auto &Arg : Spec->getTemplateArgs().asArray()) if (checkTemplateArgument(Arg)) return true; - return isAncestorDeclContextOf(ParentDC, T->getOriginalDecl()); + return isAncestorDeclContextOf(ParentDC, T->getDecl()); } std::optional VisitPointerType(const PointerType *T) { diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 155734679b2da..da64c92221837 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -878,10 +878,10 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, // Treat the enumeration as its underlying type and use the builtin type // class comparison. if (T1->getTypeClass() == Type::Enum) { - T1 = cast(T1)->getOriginalDecl()->getIntegerType(); + T1 = cast(T1)->getDecl()->getIntegerType(); assert(T2->isBuiltinType() && !T1.isNull()); // Sanity check } else if (T2->getTypeClass() == Type::Enum) { - T2 = cast(T2)->getOriginalDecl()->getIntegerType(); + T2 = cast(T2)->getDecl()->getIntegerType(); assert(T1->isBuiltinType() && !T2.isNull()); // Sanity check } TC = Type::Builtin; @@ -1300,8 +1300,7 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, if (!IsStructurallyEquivalent(Context, TT1->getQualifier(), TT2->getQualifier())) return false; - if (!IsStructurallyEquivalent(Context, TT1->getOriginalDecl(), - TT2->getOriginalDecl())) + if (!IsStructurallyEquivalent(Context, TT1->getDecl(), TT2->getDecl())) return false; break; } @@ -1531,8 +1530,8 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, // types if (Field1->isAnonymousStructOrUnion() && Field2->isAnonymousStructOrUnion()) { - RecordDecl *D1 = Field1->getType()->castAs()->getOriginalDecl(); - RecordDecl *D2 = Field2->getType()->castAs()->getOriginalDecl(); + RecordDecl *D1 = Field1->getType()->castAs()->getDecl(); + RecordDecl *D2 = Field2->getType()->castAs()->getDecl(); return IsStructurallyEquivalent(Context, D1, D2); } @@ -1763,19 +1762,6 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, // another anonymous structure or union, respectively, if their members // fulfill the preceding requirements. ... Otherwise, the structure, union, // or enumerated types are incompatible. - - // Note: "the same tag" refers to the identifier for the structure; two - // structures without names are not compatible within a TU. In C23, if either - // declaration has no name, they're not equivalent. However, the paragraph - // after the bulleted list goes on to talk about compatibility of anonymous - // structure and union members, so this prohibition only applies to top-level - // declarations; if either declaration is not a member, they cannot be - // compatible. - if (Context.LangOpts.C23 && (!D1->getIdentifier() || !D2->getIdentifier()) && - (!D1->getDeclContext()->isRecord() || !D2->getDeclContext()->isRecord())) - return false; - - // Otherwise, check the names for equivalence. if (!NameIsStructurallyEquivalent(*D1, *D2)) return false; @@ -2600,7 +2586,7 @@ StructuralEquivalenceContext::findUntaggedStructOrUnionIndex(RecordDecl *Anon) { // struct { ... 
} A; QualType FieldType = F->getType(); if (const auto *RecType = dyn_cast(FieldType)) { - const RecordDecl *RecDecl = RecType->getOriginalDecl(); + const RecordDecl *RecDecl = RecType->getDecl(); if (RecDecl->getDeclContext() == Owner && !RecDecl->getIdentifier()) { if (Context.hasSameType(FieldType, AnonTy)) break; diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index c71fd22fe9d7e..6b989276e6d7d 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -4660,7 +4660,7 @@ const RecordType *Compiler::getRecordTy(QualType Ty) { template Record *Compiler::getRecord(QualType Ty) { if (const auto *RecordTy = getRecordTy(Ty)) - return getRecord(RecordTy->getOriginalDecl()->getDefinitionOrSelf()); + return getRecord(RecordTy->getDecl()->getDefinitionOrSelf()); return nullptr; } @@ -4841,46 +4841,39 @@ Compiler::visitVarDecl(const VarDecl *VD, const Expr *Init, return !NeedsOp || this->emitCheckDecl(VD, VD); }; - auto initGlobal = [&](unsigned GlobalIndex) -> bool { - assert(Init); - - if (VarT) { - if (!this->visit(Init)) - return checkDecl() && false; - - return checkDecl() && this->emitInitGlobal(*VarT, GlobalIndex, VD); - } - - if (!checkDecl()) - return false; - - if (!this->emitGetPtrGlobal(GlobalIndex, Init)) - return false; - - if (!visitInitializer(Init)) - return false; - - return this->emitFinishInitGlobal(Init); - }; - DeclScope LocalScope(this, VD); - // We've already seen and initialized this global. - if (UnsignedOrNone GlobalIndex = P.getGlobal(VD)) { + UnsignedOrNone GlobalIndex = P.getGlobal(VD); + if (GlobalIndex) { + // We've already seen and initialized this global. if (P.getPtrGlobal(*GlobalIndex).isInitialized()) return checkDecl(); - // The previous attempt at initialization might've been unsuccessful, // so let's try this one. - return Init && checkDecl() && initGlobal(*GlobalIndex); + } else if ((GlobalIndex = P.createGlobal(VD, Init))) { + } else { + return false; } + if (!Init) + return true; - UnsignedOrNone GlobalIndex = P.createGlobal(VD, Init); + if (!checkDecl()) + return false; - if (!GlobalIndex) + if (VarT) { + if (!this->visit(Init)) + return false; + + return this->emitInitGlobal(*VarT, *GlobalIndex, VD); + } + + if (!this->emitGetPtrGlobal(*GlobalIndex, Init)) + return false; + + if (!visitInitializer(Init)) return false; - return !Init || (checkDecl() && initGlobal(*GlobalIndex)); + return this->emitFinishInitGlobal(Init); } // Local variables. InitLinkScope ILS(this, InitLink::Decl(VD)); @@ -4890,36 +4883,37 @@ Compiler::visitVarDecl(const VarDecl *VD, const Expr *Init, VD, *VarT, VD->getType().isConstQualified(), VD->getType().isVolatileQualified(), nullptr, ScopeKind::Block, IsConstexprUnknown); - if (Init) { - // If this is a toplevel declaration, create a scope for the - // initializer. - if (Toplevel) { - LocalScope Scope(this); - if (!this->visit(Init)) - return false; - return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals(); - } - if (!this->visit(Init)) - return false; - return this->emitSetLocal(*VarT, Offset, VD); - } - } else { - if (UnsignedOrNone Offset = this->allocateLocal( - VD, VD->getType(), nullptr, ScopeKind::Block, IsConstexprUnknown)) { - if (!Init) - return true; - if (!this->emitGetPtrLocal(*Offset, Init)) - return false; + if (!Init) + return true; - if (!visitInitializer(Init)) + // If this is a toplevel declaration, create a scope for the + // initializer. 
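+    // (The scope owns whatever locals the initializer creates; they are
+    // destroyed via Scope.destroyLocals() below once the value is stored.)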
+ if (Toplevel) { + LocalScope Scope(this); + if (!this->visit(Init)) return false; - - return this->emitFinishInitPop(Init); + return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals(); } - return false; + if (!this->visit(Init)) + return false; + return this->emitSetLocal(*VarT, Offset, VD); } - return true; + // Local composite variables. + if (UnsignedOrNone Offset = this->allocateLocal( + VD, VD->getType(), nullptr, ScopeKind::Block, IsConstexprUnknown)) { + if (!Init) + return true; + + if (!this->emitGetPtrLocal(*Offset, Init)) + return false; + + if (!visitInitializer(Init)) + return false; + + return this->emitFinishInitPop(Init); + } + return false; } template @@ -6633,7 +6627,7 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { if (!this->visit(SubExpr)) return false; - if (!this->emitCheckNull(E)) + if (!SubExpr->getType()->isFunctionPointerType() && !this->emitCheckNull(E)) return false; if (classifyPrim(SubExpr) == PT_Ptr) diff --git a/clang/lib/AST/ByteCode/Context.cpp b/clang/lib/AST/ByteCode/Context.cpp index 683e916391337..12bf3a3954b1b 100644 --- a/clang/lib/AST/ByteCode/Context.cpp +++ b/clang/lib/AST/ByteCode/Context.cpp @@ -7,12 +7,15 @@ //===----------------------------------------------------------------------===// #include "Context.h" +#include "Boolean.h" #include "ByteCodeEmitter.h" #include "Compiler.h" #include "EvalEmitter.h" -#include "Interp.h" +#include "Integral.h" #include "InterpFrame.h" +#include "InterpHelpers.h" #include "InterpStack.h" +#include "Pointer.h" #include "PrimType.h" #include "Program.h" #include "clang/AST/ASTLambda.h" @@ -566,10 +569,15 @@ const Function *Context::getOrCreateFunction(const FunctionDecl *FuncDecl) { // Assign descriptors to all parameters. // Composite objects are lowered to pointers. - for (const ParmVarDecl *PD : FuncDecl->parameters()) { + const auto *FuncProto = FuncDecl->getType()->getAs(); + for (auto [ParamIndex, PD] : llvm::enumerate(FuncDecl->parameters())) { bool IsConst = PD->getType().isConstQualified(); bool IsVolatile = PD->getType().isVolatileQualified(); + if (!getASTContext().hasSameType(PD->getType(), + FuncProto->getParamType(ParamIndex))) + return nullptr; + OptPrimType T = classify(PD->getType()); PrimType PT = T.value_or(PT_Ptr); Descriptor *Desc = P->createDescriptor(PD, PT, nullptr, std::nullopt, diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 89043968915a9..a72282caf5e73 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1358,9 +1358,6 @@ bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm, void diagnoseEnumValue(InterpState &S, CodePtr OpPC, const EnumDecl *ED, const APSInt &Value) { - if (S.EvaluatingDecl && !S.EvaluatingDecl->isConstexpr()) - return; - llvm::APInt Min; llvm::APInt Max; ED->getValueRange(Max, Min); diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 57cc705282d1b..d8529daf591ee 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -22,6 +22,7 @@ #include "Function.h" #include "InterpBuiltinBitCast.h" #include "InterpFrame.h" +#include "InterpHelpers.h" #include "InterpStack.h" #include "InterpState.h" #include "MemberPointer.h" @@ -43,28 +44,10 @@ using FixedPointSemantics = llvm::FixedPointSemantics; /// Checks if the variable has externally defined storage. bool CheckExtern(InterpState &S, CodePtr OpPC, const Pointer &Ptr); -/// Checks if the array is offsetable. 
-bool CheckArray(InterpState &S, CodePtr OpPC, const Pointer &Ptr); - -/// Checks if a pointer is live and accessible. -bool CheckLive(InterpState &S, CodePtr OpPC, const Pointer &Ptr, - AccessKinds AK); - -/// Checks if a pointer is a dummy pointer. -bool CheckDummy(InterpState &S, CodePtr OpPC, const Block *B, AccessKinds AK); - /// Checks if a pointer is null. bool CheckNull(InterpState &S, CodePtr OpPC, const Pointer &Ptr, CheckSubobjectKind CSK); -/// Checks if a pointer is in range. -bool CheckRange(InterpState &S, CodePtr OpPC, const Pointer &Ptr, - AccessKinds AK); - -/// Checks if a field from which a pointer is going to be derived is valid. -bool CheckRange(InterpState &S, CodePtr OpPC, const Pointer &Ptr, - CheckSubobjectKind CSK); - /// Checks if Ptr is a one-past-the-end pointer. bool CheckSubobject(InterpState &S, CodePtr OpPC, const Pointer &Ptr, CheckSubobjectKind CSK); @@ -80,12 +63,6 @@ bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr); /// Checks if the Descriptor is of a constexpr or const global variable. bool CheckConstant(InterpState &S, CodePtr OpPC, const Descriptor *Desc); -/// Checks if a pointer points to a mutable field. -bool CheckMutable(InterpState &S, CodePtr OpPC, const Pointer &Ptr); - -/// Checks if a value can be loaded from a block. -bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr, - AccessKinds AK = AK_Read); bool CheckFinalLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr); bool DiagnoseUninitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, @@ -110,12 +87,6 @@ bool CheckThis(InterpState &S, CodePtr OpPC); /// language mode. bool CheckDynamicMemoryAllocation(InterpState &S, CodePtr OpPC); -/// Diagnose mismatched new[]/delete or new/delete[] pairs. -bool CheckNewDeleteForms(InterpState &S, CodePtr OpPC, - DynamicAllocator::Form AllocForm, - DynamicAllocator::Form DeleteForm, const Descriptor *D, - const Expr *NewExpr); - /// Check the source of the pointer passed to delete/delete[] has actually /// been heap allocated by us. bool CheckDeleteSource(InterpState &S, CodePtr OpPC, const Expr *Source, @@ -129,9 +100,6 @@ bool CheckActive(InterpState &S, CodePtr OpPC, const Pointer &Ptr, bool SetThreeWayComparisonField(InterpState &S, CodePtr OpPC, const Pointer &Ptr, const APSInt &IntValue); -/// Copy the contents of Src into Dest. -bool DoMemcpy(InterpState &S, CodePtr OpPC, const Pointer &Src, Pointer &Dest); - bool CallVar(InterpState &S, CodePtr OpPC, const Function *Func, uint32_t VarArgSize); bool Call(InterpState &S, CodePtr OpPC, const Function *Func, @@ -149,19 +117,11 @@ bool CheckBitCast(InterpState &S, CodePtr OpPC, bool HasIndeterminateBits, bool CheckBCPResult(InterpState &S, const Pointer &Ptr); bool CheckDestructor(InterpState &S, CodePtr OpPC, const Pointer &Ptr); -template -static bool handleOverflow(InterpState &S, CodePtr OpPC, const T &SrcValue) { - const Expr *E = S.Current->getExpr(OpPC); - S.CCEDiag(E, diag::note_constexpr_overflow) << SrcValue << E->getType(); - return S.noteUndefinedBehavior(); -} bool handleFixedPointOverflow(InterpState &S, CodePtr OpPC, const FixedPoint &FP); bool isConstexprUnknown(const Pointer &P); -inline bool CheckArraySize(InterpState &S, CodePtr OpPC, uint64_t NumElems); - enum class ShiftDir { Left, Right }; /// Checks if the shift operation is legal. 
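The shift-legality helper documented just above rejects shifts the language leaves undefined. A minimal illustration of what that means in practice (ordinary C++ shown for exposition; it is not part of this patch):

    // Both of the following are ill-formed in a constant expression and are
    // the kind of operand the legality check diagnoses:
    //   constexpr int BadWidth = 1 << 32;  // shift amount >= bit width of int
    //   constexpr int BadSign  = 1 << -1;  // negative shift amount
    constexpr int Ok = 1 << 3;              // fine: evaluates to 8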
@@ -241,43 +201,6 @@ bool CheckDivRem(InterpState &S, CodePtr OpPC, const T &LHS, const T &RHS) { return true; } -template -bool CheckArraySize(InterpState &S, CodePtr OpPC, SizeT *NumElements, - unsigned ElemSize, bool IsNoThrow) { - // FIXME: Both the SizeT::from() as well as the - // NumElements.toAPSInt() in this function are rather expensive. - - // Can't be too many elements if the bitwidth of NumElements is lower than - // that of Descriptor::MaxArrayElemBytes. - if ((NumElements->bitWidth() - NumElements->isSigned()) < - (sizeof(Descriptor::MaxArrayElemBytes) * 8)) - return true; - - // FIXME: GH63562 - // APValue stores array extents as unsigned, - // so anything that is greater that unsigned would overflow when - // constructing the array, we catch this here. - SizeT MaxElements = SizeT::from(Descriptor::MaxArrayElemBytes / ElemSize); - assert(MaxElements.isPositive()); - if (NumElements->toAPSInt().getActiveBits() > - ConstantArrayType::getMaxSizeBits(S.getASTContext()) || - *NumElements > MaxElements) { - if (!IsNoThrow) { - const SourceInfo &Loc = S.Current->getSource(OpPC); - - if (NumElements->isSigned() && NumElements->isNegative()) { - S.FFDiag(Loc, diag::note_constexpr_new_negative) - << NumElements->toDiagnosticString(S.getASTContext()); - } else { - S.FFDiag(Loc, diag::note_constexpr_new_too_large) - << NumElements->toDiagnosticString(S.getASTContext()); - } - } - return false; - } - return true; -} - /// Checks if the result of a floating-point operation is valid /// in the current context. bool CheckFloatResult(InterpState &S, CodePtr OpPC, const Floating &Result, @@ -286,19 +209,6 @@ bool CheckFloatResult(InterpState &S, CodePtr OpPC, const Floating &Result, /// Checks why the given DeclRefExpr is invalid. bool CheckDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR); -/// Interpreter entry point. -bool Interpret(InterpState &S); - -/// Interpret a builtin function. -bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, - uint32_t BuiltinID); - -/// Interpret an offsetof operation. 
-bool InterpretOffsetOf(InterpState &S, CodePtr OpPC, const OffsetOfExpr *E, - ArrayRef ArrayIndices, int64_t &Result); - -inline bool Invalid(InterpState &S, CodePtr OpPC); - enum class ArithOp { Add, Sub }; //===----------------------------------------------------------------------===// @@ -403,13 +313,6 @@ bool Add(InterpState &S, CodePtr OpPC) { return AddSubMulHelper(S, OpPC, Bits, LHS, RHS); } -static inline llvm::RoundingMode getRoundingMode(FPOptions FPO) { - auto RM = FPO.getRoundingMode(); - if (RM == llvm::RoundingMode::Dynamic) - return llvm::RoundingMode::NearestTiesToEven; - return RM; -} - inline bool Addf(InterpState &S, CodePtr OpPC, uint32_t FPOI) { const Floating &RHS = S.Stk.pop(); const Floating &LHS = S.Stk.pop(); @@ -2258,6 +2161,8 @@ std::optional OffsetHelper(InterpState &S, CodePtr OpPC, S.CCEDiag(S.Current->getSource(OpPC), diag::note_constexpr_array_index) << N << /*non-array*/ true << 0; return Pointer(Ptr.asFunctionPointer().getFunction(), N); + } else if (!Ptr.isBlockPointer()) { + return std::nullopt; } assert(Ptr.isBlockPointer()); @@ -3096,7 +3001,8 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) { } if (Offset.isZero()) { - if (Ptr.getFieldDesc()->isArray() && Ptr.getIndex() == 0) { + if (const Descriptor *Desc = Ptr.getFieldDesc(); + Desc && Desc->isArray() && Ptr.getIndex() == 0) { S.Stk.push(Ptr.atIndex(0).narrow()); return true; } @@ -3126,7 +3032,8 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) { } if (Offset.isZero()) { - if (Ptr.getFieldDesc()->isArray() && Ptr.getIndex() == 0) { + if (const Descriptor *Desc = Ptr.getFieldDesc(); + Desc && Desc->isArray() && Ptr.getIndex() == 0) { S.Stk.push(Ptr.atIndex(0).narrow()); return true; } @@ -3260,12 +3167,6 @@ inline bool GetMemberPtrDecl(InterpState &S, CodePtr OpPC) { /// Just emit a diagnostic. The expression that caused emission of this /// op is not valid in a constant context. 
-inline bool Invalid(InterpState &S, CodePtr OpPC) { - const SourceLocation &Loc = S.Current->getLocation(OpPC); - S.FFDiag(Loc, diag::note_invalid_subexpr_in_const_expr) - << S.Current->getRange(OpPC); - return false; -} inline bool Unsupported(InterpState &S, CodePtr OpPC) { const SourceLocation &Loc = S.Current->getLocation(OpPC); @@ -3697,17 +3598,6 @@ inline bool CheckDestruction(InterpState &S, CodePtr OpPC) { return CheckDestructor(S, OpPC, Ptr); } -inline bool CheckArraySize(InterpState &S, CodePtr OpPC, uint64_t NumElems) { - uint64_t Limit = S.getLangOpts().ConstexprStepLimit; - if (Limit != 0 && NumElems > Limit) { - S.FFDiag(S.Current->getSource(OpPC), - diag::note_constexpr_new_exceeds_limits) - << NumElems << Limit; - return false; - } - return true; -} - //===----------------------------------------------------------------------===// // Read opcode arguments //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/ByteCode/InterpBlock.cpp b/clang/lib/AST/ByteCode/InterpBlock.cpp index ac6f01f3cdca1..24825ad2557ef 100644 --- a/clang/lib/AST/ByteCode/InterpBlock.cpp +++ b/clang/lib/AST/ByteCode/InterpBlock.cpp @@ -100,6 +100,19 @@ bool Block::hasPointer(const Pointer *P) const { } #endif +void Block::movePointersTo(Block *B) { + assert(B != this); + + while (Pointers) { + Pointer *P = Pointers; + + this->removePointer(P); + P->BS.Pointee = B; + B->addPointer(P); + } + assert(!this->hasPointers()); +} + DeadBlock::DeadBlock(DeadBlock *&Root, Block *Blk) : Root(Root), B(~0u, Blk->Desc, Blk->isExtern(), Blk->IsStatic, Blk->isWeak(), Blk->isDummy(), /*IsDead=*/true) { diff --git a/clang/lib/AST/ByteCode/InterpBlock.h b/clang/lib/AST/ByteCode/InterpBlock.h index 9b3dadca6cc14..73fdc8d85da11 100644 --- a/clang/lib/AST/ByteCode/InterpBlock.h +++ b/clang/lib/AST/ByteCode/InterpBlock.h @@ -92,6 +92,8 @@ class Block final { bool isInitialized() const { return IsInitialized; } /// The Evaluation ID this block was created in. unsigned getEvalID() const { return EvalID; } + /// Move all pointers from this block to \param B. + void movePointersTo(Block *B); /// Returns a pointer to the stored data. /// You are allowed to read Desc->getSize() bytes from this address. 
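Block::movePointersTo above drains the block's intrusive pointer list by repeatedly detaching the current head and relinking it into the destination, which is why the loop can keep reading `Pointers` as it mutates. A self-contained sketch of the same drain-and-relink pattern, using hypothetical stand-in types rather than the patch's real Pointer/Block classes:

    #include <cassert>

    struct Owner;
    struct Node {            // stand-in for interp::Pointer
      Node *Next = nullptr;
      Owner *Pointee = nullptr;
    };

    struct Owner {           // stand-in for interp::Block
      Node *Head = nullptr;

      void add(Node *N) {    // push onto the intrusive list
        N->Next = Head;
        Head = N;
      }
      void remove(Node *N) { // this sketch only ever unlinks the head
        assert(N == Head);
        Head = N->Next;
        N->Next = nullptr;
      }
      // Drain the head until the list is empty, repointing each node at the
      // destination before relinking it there.
      void moveNodesTo(Owner *B) {
        assert(B != this);
        while (Head) {
          Node *N = Head;
          remove(N);
          N->Pointee = B;
          B->add(N);
        }
        assert(!Head);
      }
    };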
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 84c5ecc9aac1e..ff83c52b0c8f6 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -8,9 +8,10 @@ #include "../ExprConstShared.h" #include "Boolean.h" #include "EvalEmitter.h" -#include "Interp.h" #include "InterpBuiltinBitCast.h" +#include "InterpHelpers.h" #include "PrimType.h" +#include "Program.h" #include "clang/AST/OSLog.h" #include "clang/AST/RecordLayout.h" #include "clang/Basic/Builtins.h" @@ -23,7 +24,7 @@ namespace clang { namespace interp { -LLVM_ATTRIBUTE_UNUSED static bool isNoopBuiltin(unsigned ID) { +[[maybe_unused]] static bool isNoopBuiltin(unsigned ID) { switch (ID) { case Builtin::BIas_const: case Builtin::BIforward: @@ -2041,10 +2042,16 @@ static bool interp__builtin_memchr(InterpState &S, CodePtr OpPC, } if (ID == Builtin::BIstrchr || ID == Builtin::BI__builtin_strchr) { + int64_t DesiredTrunc; + if (S.getASTContext().CharTy->isSignedIntegerType()) + DesiredTrunc = + Desired.trunc(S.getASTContext().getCharWidth()).getSExtValue(); + else + DesiredTrunc = + Desired.trunc(S.getASTContext().getCharWidth()).getZExtValue(); // strchr compares directly to the passed integer, and therefore // always fails if given an int that is not a char. - if (Desired != - Desired.trunc(S.getASTContext().getCharWidth()).getSExtValue()) { + if (Desired != DesiredTrunc) { S.Stk.push(); return true; } @@ -2587,6 +2594,82 @@ static bool interp__builtin_ia32_pmul( return true; } +static bool interp_builtin_horizontal_int_binop( + InterpState &S, CodePtr OpPC, const CallExpr *Call, + llvm::function_ref Fn) { + const auto *VT = Call->getArg(0)->getType()->castAs(); + PrimType ElemT = *S.getContext().classify(VT->getElementType()); + bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType(); + + const Pointer &RHS = S.Stk.pop(); + const Pointer &LHS = S.Stk.pop(); + const Pointer &Dst = S.Stk.peek(); + unsigned NumElts = VT->getNumElements(); + unsigned EltBits = S.getASTContext().getIntWidth(VT->getElementType()); + unsigned EltsPerLane = 128 / EltBits; + unsigned Lanes = NumElts * EltBits / 128; + unsigned DestIndex = 0; + + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { + unsigned LaneStart = Lane * EltsPerLane; + for (unsigned I = 0; I < EltsPerLane; I += 2) { + INT_TYPE_SWITCH_NO_BOOL(ElemT, { + APSInt Elem1 = LHS.elem(LaneStart + I).toAPSInt(); + APSInt Elem2 = LHS.elem(LaneStart + I + 1).toAPSInt(); + APSInt ResL = APSInt(Fn(Elem1, Elem2), DestUnsigned); + Dst.elem(DestIndex++) = static_cast(ResL); + }); + } + + for (unsigned I = 0; I < EltsPerLane; I += 2) { + INT_TYPE_SWITCH_NO_BOOL(ElemT, { + APSInt Elem1 = RHS.elem(LaneStart + I).toAPSInt(); + APSInt Elem2 = RHS.elem(LaneStart + I + 1).toAPSInt(); + APSInt ResR = APSInt(Fn(Elem1, Elem2), DestUnsigned); + Dst.elem(DestIndex++) = static_cast(ResR); + }); + } + } + Dst.initializeAllElements(); + return true; +} + +static bool interp_builtin_horizontal_fp_binop( + InterpState &S, CodePtr OpPC, const CallExpr *Call, + llvm::function_ref + Fn) { + const Pointer &RHS = S.Stk.pop(); + const Pointer &LHS = S.Stk.pop(); + const Pointer &Dst = S.Stk.peek(); + FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts()); + llvm::RoundingMode RM = getRoundingMode(FPO); + const auto *VT = Call->getArg(0)->getType()->castAs(); + + unsigned NumElts = VT->getNumElements(); + unsigned EltBits = S.getASTContext().getTypeSize(VT->getElementType()); + unsigned NumLanes = NumElts * 
EltBits / 128; + unsigned NumElemsPerLane = NumElts / NumLanes; + unsigned HalfElemsPerLane = NumElemsPerLane / 2; + + for (unsigned L = 0; L != NumElts; L += NumElemsPerLane) { + using T = PrimConv::T; + for (unsigned E = 0; E != HalfElemsPerLane; ++E) { + APFloat Elem1 = LHS.elem(L + (2 * E) + 0).getAPFloat(); + APFloat Elem2 = LHS.elem(L + (2 * E) + 1).getAPFloat(); + Dst.elem(L + E) = static_cast(Fn(Elem1, Elem2, RM)); + } + for (unsigned E = 0; E != HalfElemsPerLane; ++E) { + APFloat Elem1 = RHS.elem(L + (2 * E) + 0).getAPFloat(); + APFloat Elem2 = RHS.elem(L + (2 * E) + 1).getAPFloat(); + Dst.elem(L + E + HalfElemsPerLane) = + static_cast(Fn(Elem1, Elem2, RM)); + } + } + Dst.initializeAllElements(); + return true; +} + static bool interp__builtin_elementwise_triop_fp( InterpState &S, CodePtr OpPC, const CallExpr *Call, llvm::function_refgetNumArgs() == 2 && "masked forms handled via select*"); + const Pointer &Control = S.Stk.pop(); + const Pointer &Src = S.Stk.pop(); + const Pointer &Dst = S.Stk.peek(); + + unsigned NumElems = Dst.getNumElems(); + assert(NumElems == Control.getNumElems()); + assert(NumElems == Dst.getNumElems()); + + for (unsigned Idx = 0; Idx != NumElems; ++Idx) { + uint8_t Ctlb = static_cast(Control.elem(Idx)); + + if (Ctlb & 0x80) { + Dst.elem(Idx) = 0; + } else { + unsigned LaneBase = (Idx / 16) * 16; + unsigned SrcOffset = Ctlb & 0x0F; + unsigned SrcIdx = LaneBase + SrcOffset; + + Dst.elem(Idx) = Src.elem(SrcIdx); + } + } + Dst.initializeAllElements(); + return true; +} + static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, const CallExpr *Call, bool IsShufHW) { assert(Call->getNumArgs() == 2 && "masked forms handled via select*"); @@ -2795,6 +2906,35 @@ static bool interp__builtin_ia32_test_op( return true; } +static bool interp__builtin_ia32_movmsk_op(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + assert(Call->getNumArgs() == 1); + + const Pointer &Source = S.Stk.pop(); + + unsigned SourceLen = Source.getNumElems(); + QualType ElemQT = getElemType(Source); + OptPrimType ElemT = S.getContext().classify(ElemQT); + unsigned ResultLen = + S.getASTContext().getTypeSize(Call->getType()); // Always 32-bit integer. 
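+  // For example (not part of the change): __builtin_ia32_movmskps on
+  // {-1.0f, 2.0f, -3.0f, 4.0f} gathers the four sign bits into 0b0101 == 5.
+  // The loop below sets bit I of Result to the sign bit of element I.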
+ APInt Result(ResultLen, 0); + + for (unsigned I = 0; I != SourceLen; ++I) { + APInt Elem; + if (ElemQT->isIntegerType()) { + INT_TYPE_SWITCH_NO_BOOL(*ElemT, { Elem = Source.elem(I).toAPSInt(); }); + } else if (ElemQT->isRealFloatingType()) { + using T = PrimConv::T; + Elem = Source.elem(I).getAPFloat().bitcastToAPInt(); + } else { + return false; + } + Result.setBitVal(I, Elem.isNegative()); + } + pushInteger(S, Result, Call->getType()); + return true; +} + static bool interp__builtin_elementwise_triop( InterpState &S, CodePtr OpPC, const CallExpr *Call, llvm::function_ref @@ -2858,6 +2998,82 @@ static bool interp__builtin_elementwise_triop( return true; } +static bool interp__builtin_x86_extract_vector(InterpState &S, CodePtr OpPC, + const CallExpr *Call, + unsigned ID) { + assert(Call->getNumArgs() == 2); + + APSInt ImmAPS = popToAPSInt(S, Call->getArg(1)); + uint64_t Index = ImmAPS.getZExtValue(); + + const Pointer &Src = S.Stk.pop(); + if (!Src.getFieldDesc()->isPrimitiveArray()) + return false; + + const Pointer &Dst = S.Stk.peek(); + if (!Dst.getFieldDesc()->isPrimitiveArray()) + return false; + + unsigned SrcElems = Src.getNumElems(); + unsigned DstElems = Dst.getNumElems(); + + unsigned NumLanes = SrcElems / DstElems; + unsigned Lane = static_cast(Index % NumLanes); + unsigned ExtractPos = Lane * DstElems; + + PrimType ElemT = Src.getFieldDesc()->getPrimType(); + + TYPE_SWITCH(ElemT, { + for (unsigned I = 0; I != DstElems; ++I) { + Dst.elem(I) = Src.elem(ExtractPos + I); + } + }); + + Dst.initializeAllElements(); + return true; +} + +static bool interp__builtin_x86_extract_vector_masked(InterpState &S, + CodePtr OpPC, + const CallExpr *Call, + unsigned ID) { + assert(Call->getNumArgs() == 4); + + APSInt MaskAPS = popToAPSInt(S, Call->getArg(3)); + const Pointer &Merge = S.Stk.pop(); + APSInt ImmAPS = popToAPSInt(S, Call->getArg(1)); + const Pointer &Src = S.Stk.pop(); + + if (!Src.getFieldDesc()->isPrimitiveArray() || + !Merge.getFieldDesc()->isPrimitiveArray()) + return false; + + const Pointer &Dst = S.Stk.peek(); + if (!Dst.getFieldDesc()->isPrimitiveArray()) + return false; + + unsigned SrcElems = Src.getNumElems(); + unsigned DstElems = Dst.getNumElems(); + + unsigned NumLanes = SrcElems / DstElems; + unsigned Lane = static_cast(ImmAPS.getZExtValue() % NumLanes); + unsigned Base = Lane * DstElems; + + PrimType ElemT = Src.getFieldDesc()->getPrimType(); + + TYPE_SWITCH(ElemT, { + for (unsigned I = 0; I != DstElems; ++I) { + if (MaskAPS[I]) + Dst.elem(I) = Src.elem(Base + I); + else + Dst.elem(I) = Merge.elem(I); + } + }); + + Dst.initializeAllElements(); + return true; +} + static bool interp__builtin_x86_insert_subvector(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned ID) { @@ -2899,6 +3115,45 @@ static bool interp__builtin_x86_insert_subvector(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_ia32_phminposuw(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + assert(Call->getNumArgs() == 1); + + const Pointer &Source = S.Stk.pop(); + const Pointer &Dest = S.Stk.peek(); + + unsigned SourceLen = Source.getNumElems(); + QualType ElemQT = getElemType(Source); + OptPrimType ElemT = S.getContext().classify(ElemQT); + unsigned ElemBitWidth = S.getASTContext().getTypeSize(ElemQT); + + bool DestUnsigned = Call->getCallReturnType(S.getASTContext()) + ->castAs() + ->getElementType() + ->isUnsignedIntegerOrEnumerationType(); + + INT_TYPE_SWITCH_NO_BOOL(*ElemT, { + APSInt MinIndex(ElemBitWidth, DestUnsigned); + APSInt MinVal = 
Source.elem(0).toAPSInt(); + + for (unsigned I = 1; I != SourceLen; ++I) { + APSInt Val = Source.elem(I).toAPSInt(); + if (MinVal.ugt(Val)) { + MinVal = Val; + MinIndex = I; + } + } + + Dest.elem(0) = static_cast(MinVal); + Dest.elem(1) = static_cast(MinIndex); + for (unsigned I = 2; I != SourceLen; ++I) { + Dest.elem(I) = static_cast(APSInt(ElemBitWidth, DestUnsigned)); + } + }); + Dest.initializeAllElements(); + return true; +} + static bool interp__builtin_ia32_pternlog(InterpState &S, CodePtr OpPC, const CallExpr *Call, bool MaskZ) { assert(Call->getNumArgs() == 5); @@ -2997,6 +3252,33 @@ static bool interp__builtin_vec_set(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_ia32_vpconflict(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + assert(Call->getNumArgs() == 1); + + QualType Arg0Type = Call->getArg(0)->getType(); + const auto *VecT = Arg0Type->castAs(); + PrimType ElemT = *S.getContext().classify(VecT->getElementType()); + unsigned NumElems = VecT->getNumElements(); + bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType(); + const Pointer &Src = S.Stk.pop(); + const Pointer &Dst = S.Stk.peek(); + + for (unsigned I = 0; I != NumElems; ++I) { + INT_TYPE_SWITCH_NO_BOOL(ElemT, { + APSInt ElemI = Src.elem(I).toAPSInt(); + APInt ConflictMask(ElemI.getBitWidth(), 0); + for (unsigned J = 0; J != I; ++J) { + APSInt ElemJ = Src.elem(J).toAPSInt(); + ConflictMask.setBitVal(J, ElemI == ElemJ); + } + Dst.elem(I) = static_cast(APSInt(ConflictMask, DestUnsigned)); + }); + } + Dst.initializeAllElements(); + return true; +} + bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, uint32_t BuiltinID) { if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID)) @@ -3154,14 +3436,14 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BI__builtin_parityl: case Builtin::BI__builtin_parityll: return interp__builtin_elementwise_int_unaryop( - S, OpPC, Call, [](const APSInt &Val) -> APInt { + S, OpPC, Call, [](const APSInt &Val) { return APInt(Val.getBitWidth(), Val.popcount() % 2); }); case Builtin::BI__builtin_clrsb: case Builtin::BI__builtin_clrsbl: case Builtin::BI__builtin_clrsbll: return interp__builtin_elementwise_int_unaryop( - S, OpPC, Call, [](const APSInt &Val) -> APInt { + S, OpPC, Call, [](const APSInt &Val) { return APInt(Val.getBitWidth(), Val.getBitWidth() - Val.getSignificantBits()); }); @@ -3170,8 +3452,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BI__builtin_bitreverse32: case Builtin::BI__builtin_bitreverse64: return interp__builtin_elementwise_int_unaryop( - S, OpPC, Call, - [](const APSInt &Val) -> APInt { return Val.reverseBits(); }); + S, OpPC, Call, [](const APSInt &Val) { return Val.reverseBits(); }); case Builtin::BI__builtin_classify_type: return interp__builtin_classify_type(S, OpPC, Frame, Call); @@ -3490,6 +3771,43 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) { return LHS.isSigned() ? 
LHS.ssub_sat(RHS) : LHS.usub_sat(RHS); }); + case X86::BI__builtin_ia32_extract128i256: + case X86::BI__builtin_ia32_vextractf128_pd256: + case X86::BI__builtin_ia32_vextractf128_ps256: + case X86::BI__builtin_ia32_vextractf128_si256: + return interp__builtin_x86_extract_vector(S, OpPC, Call, BuiltinID); + + case X86::BI__builtin_ia32_extractf32x4_256_mask: + case X86::BI__builtin_ia32_extractf32x4_mask: + case X86::BI__builtin_ia32_extractf32x8_mask: + case X86::BI__builtin_ia32_extractf64x2_256_mask: + case X86::BI__builtin_ia32_extractf64x2_512_mask: + case X86::BI__builtin_ia32_extractf64x4_mask: + case X86::BI__builtin_ia32_extracti32x4_256_mask: + case X86::BI__builtin_ia32_extracti32x4_mask: + case X86::BI__builtin_ia32_extracti32x8_mask: + case X86::BI__builtin_ia32_extracti64x2_256_mask: + case X86::BI__builtin_ia32_extracti64x2_512_mask: + case X86::BI__builtin_ia32_extracti64x4_mask: + return interp__builtin_x86_extract_vector_masked(S, OpPC, Call, BuiltinID); + + case clang::X86::BI__builtin_ia32_pmulhrsw128: + case clang::X86::BI__builtin_ia32_pmulhrsw256: + case clang::X86::BI__builtin_ia32_pmulhrsw512: + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) { + return (llvm::APIntOps::mulsExtended(LHS, RHS).ashr(14) + 1) + .extractBits(16, 1); + }); + + case clang::X86::BI__builtin_ia32_movmskps: + case clang::X86::BI__builtin_ia32_movmskpd: + case clang::X86::BI__builtin_ia32_pmovmskb128: + case clang::X86::BI__builtin_ia32_pmovmskb256: + case clang::X86::BI__builtin_ia32_movmskps256: + case clang::X86::BI__builtin_ia32_movmskpd256: { + return interp__builtin_ia32_movmsk_op(S, OpPC, Call); + } case clang::X86::BI__builtin_ia32_pavgb128: case clang::X86::BI__builtin_ia32_pavgw128: @@ -3665,6 +3983,53 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BI__builtin_elementwise_min: return interp__builtin_elementwise_maxmin(S, OpPC, Call, BuiltinID); + case clang::X86::BI__builtin_ia32_phaddw128: + case clang::X86::BI__builtin_ia32_phaddw256: + case clang::X86::BI__builtin_ia32_phaddd128: + case clang::X86::BI__builtin_ia32_phaddd256: + return interp_builtin_horizontal_int_binop( + S, OpPC, Call, + [](const APSInt &LHS, const APSInt &RHS) { return LHS + RHS; }); + case clang::X86::BI__builtin_ia32_phaddsw128: + case clang::X86::BI__builtin_ia32_phaddsw256: + return interp_builtin_horizontal_int_binop( + S, OpPC, Call, + [](const APSInt &LHS, const APSInt &RHS) { return LHS.sadd_sat(RHS); }); + case clang::X86::BI__builtin_ia32_phsubw128: + case clang::X86::BI__builtin_ia32_phsubw256: + case clang::X86::BI__builtin_ia32_phsubd128: + case clang::X86::BI__builtin_ia32_phsubd256: + return interp_builtin_horizontal_int_binop( + S, OpPC, Call, + [](const APSInt &LHS, const APSInt &RHS) { return LHS - RHS; }); + case clang::X86::BI__builtin_ia32_phsubsw128: + case clang::X86::BI__builtin_ia32_phsubsw256: + return interp_builtin_horizontal_int_binop( + S, OpPC, Call, + [](const APSInt &LHS, const APSInt &RHS) { return LHS.ssub_sat(RHS); }); + case clang::X86::BI__builtin_ia32_haddpd: + case clang::X86::BI__builtin_ia32_haddps: + case clang::X86::BI__builtin_ia32_haddpd256: + case clang::X86::BI__builtin_ia32_haddps256: + return interp_builtin_horizontal_fp_binop( + S, OpPC, Call, + [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) { + APFloat F = LHS; + F.add(RHS, RM); + return F; + }); + case clang::X86::BI__builtin_ia32_hsubpd: + case clang::X86::BI__builtin_ia32_hsubps: + case 
clang::X86::BI__builtin_ia32_hsubpd256: + case clang::X86::BI__builtin_ia32_hsubps256: + return interp_builtin_horizontal_fp_binop( + S, OpPC, Call, + [](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) { + APFloat F = LHS; + F.subtract(RHS, RM); + return F; + }); + case clang::X86::BI__builtin_ia32_pmuldq128: case clang::X86::BI__builtin_ia32_pmuldq256: case clang::X86::BI__builtin_ia32_pmuldq512: @@ -3695,6 +4060,21 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return F; }); + case X86::BI__builtin_ia32_vpmadd52luq128: + case X86::BI__builtin_ia32_vpmadd52luq256: + case X86::BI__builtin_ia32_vpmadd52luq512: + return interp__builtin_elementwise_triop( + S, OpPC, Call, [](const APSInt &A, const APSInt &B, const APSInt &C) { + return A + (B.trunc(52) * C.trunc(52)).zext(64); + }); + case X86::BI__builtin_ia32_vpmadd52huq128: + case X86::BI__builtin_ia32_vpmadd52huq256: + case X86::BI__builtin_ia32_vpmadd52huq512: + return interp__builtin_elementwise_triop( + S, OpPC, Call, [](const APSInt &A, const APSInt &B, const APSInt &C) { + return A + llvm::APIntOps::mulhu(B.trunc(52), C.trunc(52)).zext(64); + }); + case X86::BI__builtin_ia32_vpshldd128: case X86::BI__builtin_ia32_vpshldd256: case X86::BI__builtin_ia32_vpshldd512: @@ -3725,7 +4105,13 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, [](const APSInt &Lo, const APSInt &Hi, const APSInt &Amt) { return llvm::APIntOps::fshr(Hi, Lo, Amt); }); - + case X86::BI__builtin_ia32_vpconflictsi_128: + case X86::BI__builtin_ia32_vpconflictsi_256: + case X86::BI__builtin_ia32_vpconflictsi_512: + case X86::BI__builtin_ia32_vpconflictdi_128: + case X86::BI__builtin_ia32_vpconflictdi_256: + case X86::BI__builtin_ia32_vpconflictdi_512: + return interp__builtin_ia32_vpconflict(S, OpPC, Call); case clang::X86::BI__builtin_ia32_blendpd: case clang::X86::BI__builtin_ia32_blendpd256: case clang::X86::BI__builtin_ia32_blendps: @@ -3805,6 +4191,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_selectpd_512: return interp__builtin_select(S, OpPC, Call); + case X86::BI__builtin_ia32_pshufb128: + case X86::BI__builtin_ia32_pshufb256: + case X86::BI__builtin_ia32_pshufb512: + return interp__builtin_ia32_pshufb(S, OpPC, Call); + case X86::BI__builtin_ia32_pshuflw: case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: @@ -3875,6 +4266,9 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) { return LHS + RHS; }); + case X86::BI__builtin_ia32_phminposuw128: + return interp__builtin_ia32_phminposuw(S, OpPC, Call); + case X86::BI__builtin_ia32_pternlogd128_mask: case X86::BI__builtin_ia32_pternlogd256_mask: case X86::BI__builtin_ia32_pternlogd512_mask: diff --git a/clang/lib/AST/ByteCode/InterpHelpers.h b/clang/lib/AST/ByteCode/InterpHelpers.h new file mode 100644 index 0000000000000..6bf89d318378c --- /dev/null +++ b/clang/lib/AST/ByteCode/InterpHelpers.h @@ -0,0 +1,141 @@ +//===--- InterpHelpers.h - Interpreter Helper Functions --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_INTERP_INTERPHELPERS_H +#define LLVM_CLANG_AST_INTERP_INTERPHELPERS_H + +#include "DynamicAllocator.h" +#include "InterpState.h" +#include "Pointer.h" + +namespace clang { +class CallExpr; +class OffsetOfExpr; + +namespace interp { +class Block; +struct Descriptor; + +/// Interpreter entry point. +bool Interpret(InterpState &S); + +/// Interpret a builtin function. +bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, + uint32_t BuiltinID); + +/// Interpret an offsetof operation. +bool InterpretOffsetOf(InterpState &S, CodePtr OpPC, const OffsetOfExpr *E, + ArrayRef ArrayIndices, int64_t &Result); + +/// Checks if the array is offsetable. +bool CheckArray(InterpState &S, CodePtr OpPC, const Pointer &Ptr); + +/// Checks if a pointer is live and accessible. +bool CheckLive(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + AccessKinds AK); + +/// Checks if a pointer is a dummy pointer. +bool CheckDummy(InterpState &S, CodePtr OpPC, const Block *B, AccessKinds AK); + +/// Checks if a pointer is in range. +bool CheckRange(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + AccessKinds AK); + +/// Checks if a field from which a pointer is going to be derived is valid. +bool CheckRange(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + CheckSubobjectKind CSK); + +/// Checks if a pointer points to a mutable field. +bool CheckMutable(InterpState &S, CodePtr OpPC, const Pointer &Ptr); + +/// Checks if a value can be loaded from a block. +bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + AccessKinds AK = AK_Read); + +/// Diagnose mismatched new[]/delete or new/delete[] pairs. +bool CheckNewDeleteForms(InterpState &S, CodePtr OpPC, + DynamicAllocator::Form AllocForm, + DynamicAllocator::Form DeleteForm, const Descriptor *D, + const Expr *NewExpr); + +/// Copy the contents of Src into Dest. +bool DoMemcpy(InterpState &S, CodePtr OpPC, const Pointer &Src, Pointer &Dest); + +template +static bool handleOverflow(InterpState &S, CodePtr OpPC, const T &SrcValue) { + const Expr *E = S.Current->getExpr(OpPC); + S.CCEDiag(E, diag::note_constexpr_overflow) << SrcValue << E->getType(); + return S.noteUndefinedBehavior(); +} + +inline bool CheckArraySize(InterpState &S, CodePtr OpPC, uint64_t NumElems) { + uint64_t Limit = S.getLangOpts().ConstexprStepLimit; + if (Limit != 0 && NumElems > Limit) { + S.FFDiag(S.Current->getSource(OpPC), + diag::note_constexpr_new_exceeds_limits) + << NumElems << Limit; + return false; + } + return true; +} + +static inline llvm::RoundingMode getRoundingMode(FPOptions FPO) { + auto RM = FPO.getRoundingMode(); + if (RM == llvm::RoundingMode::Dynamic) + return llvm::RoundingMode::NearestTiesToEven; + return RM; +} + +inline bool Invalid(InterpState &S, CodePtr OpPC) { + const SourceLocation &Loc = S.Current->getLocation(OpPC); + S.FFDiag(Loc, diag::note_invalid_subexpr_in_const_expr) + << S.Current->getRange(OpPC); + return false; +} + +template +bool CheckArraySize(InterpState &S, CodePtr OpPC, SizeT *NumElements, + unsigned ElemSize, bool IsNoThrow) { + // FIXME: Both the SizeT::from() as well as the + // NumElements.toAPSInt() in this function are rather expensive. + + // Can't be too many elements if the bitwidth of NumElements is lower than + // that of Descriptor::MaxArrayElemBytes. 
+  if ((NumElements->bitWidth() - NumElements->isSigned()) <
+      (sizeof(Descriptor::MaxArrayElemBytes) * 8))
+    return true;
+
+  // FIXME: GH63562
+  // APValue stores array extents as unsigned, so any extent that does not
+  // fit in an unsigned would overflow when constructing the array; we catch
+  // this here.
+  SizeT MaxElements = SizeT::from(Descriptor::MaxArrayElemBytes / ElemSize);
+  assert(MaxElements.isPositive());
+  if (NumElements->toAPSInt().getActiveBits() >
+          ConstantArrayType::getMaxSizeBits(S.getASTContext()) ||
+      *NumElements > MaxElements) {
+    if (!IsNoThrow) {
+      const SourceInfo &Loc = S.Current->getSource(OpPC);
+
+      if (NumElements->isSigned() && NumElements->isNegative()) {
+        S.FFDiag(Loc, diag::note_constexpr_new_negative)
+            << NumElements->toDiagnosticString(S.getASTContext());
+      } else {
+        S.FFDiag(Loc, diag::note_constexpr_new_too_large)
+            << NumElements->toDiagnosticString(S.getASTContext());
+      }
+    }
+    return false;
+  }
+  return true;
+}
+
+} // namespace interp
+} // namespace clang
+
+#endif // LLVM_CLANG_AST_INTERP_INTERPHELPERS_H
diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h
index a13244bf383ae..e2e4d5c985f93 100644
--- a/clang/lib/AST/ByteCode/InterpState.h
+++ b/clang/lib/AST/ByteCode/InterpState.h
@@ -114,7 +114,7 @@ class InterpState final : public State, public SourceMapper {
       Alloc = std::make_unique();
     }
-    return *Alloc.get();
+    return *Alloc;
   }
 
   /// Diagnose any dynamic allocations that haven't been freed yet.
diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp
index 663134c8696de..e417bdfb81b8f 100644
--- a/clang/lib/AST/ByteCode/Pointer.cpp
+++ b/clang/lib/AST/ByteCode/Pointer.cpp
@@ -751,7 +751,7 @@ std::optional Pointer::toRValue(const Context &Ctx,
   assert(Record && "Missing record descriptor");
 
   bool Ok = true;
-  if (RT->getOriginalDecl()->isUnion()) {
+  if (RT->getDecl()->isUnion()) {
     const FieldDecl *ActiveField = nullptr;
     APValue Value;
     for (const auto &F : Record->fields()) {
diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp
index 75bfd9fd2d8ec..e0b2852f0e906 100644
--- a/clang/lib/AST/ByteCode/Program.cpp
+++ b/clang/lib/AST/ByteCode/Program.cpp
@@ -226,11 +226,10 @@ UnsignedOrNone Program::createGlobal(const ValueDecl *VD, const Expr *Init) {
       Globals[PIdx] = NewGlobal;
 
       // All pointers pointing to the previous extern decl now point to the
       // new decl.
-      for (Pointer *Ptr = RedeclBlock->Pointers; Ptr; Ptr = Ptr->BS.Next) {
-        RedeclBlock->removePointer(Ptr);
-        Ptr->BS.Pointee = NewGlobal->block();
-        NewGlobal->block()->addPointer(Ptr);
-      }
+      // A previous iteration might've already fixed up the pointers for this
+      // global.
+ if (RedeclBlock != NewGlobal->block()) + RedeclBlock->movePointersTo(NewGlobal->block()); } } PIdx = *Idx; diff --git a/clang/lib/AST/Comment.cpp b/clang/lib/AST/Comment.cpp index 37e21c340c316..361a8a7e68990 100644 --- a/clang/lib/AST/Comment.cpp +++ b/clang/lib/AST/Comment.cpp @@ -56,16 +56,16 @@ good implements_child_begin_end(Comment::child_iterator (T::*)() const) { return good(); } -LLVM_ATTRIBUTE_UNUSED -static inline bad implements_child_begin_end( - Comment::child_iterator (Comment::*)() const) { +[[maybe_unused]] +static inline bad +implements_child_begin_end(Comment::child_iterator (Comment::*)() const) { return bad(); } #define ASSERT_IMPLEMENTS_child_begin(function) \ (void) good(implements_child_begin_end(function)) -LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] static inline void CheckCommentASTNodes() { #define ABSTRACT_COMMENT(COMMENT) #define COMMENT(CLASS, PARENT) \ diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index c7341552be365..8579e51e45697 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -2989,10 +2989,7 @@ bool ParmVarDecl::isDestroyedInCallee() const { // FIXME: isParamDestroyedInCallee() should probably imply // isDestructedType() const auto *RT = getType()->getAsCanonical(); - if (RT && - RT->getOriginalDecl() - ->getDefinitionOrSelf() - ->isParamDestroyedInCallee() && + if (RT && RT->getDecl()->getDefinitionOrSelf()->isParamDestroyedInCallee() && getType().isDestructedType()) return true; @@ -3316,6 +3313,10 @@ bool FunctionDecl::isImmediateEscalating() const { CD && CD->isInheritingConstructor()) return CD->getInheritedConstructor().getConstructor(); + // Destructors are not immediate escalating. + if (isa(this)) + return false; + // - a function that results from the instantiation of a templated entity // defined with the constexpr specifier. 
TemplatedKind TK = getTemplatedKind(); @@ -3379,11 +3380,11 @@ bool FunctionDecl::isMSVCRTEntryPoint() const { return false; return llvm::StringSwitch<bool>(getName()) - .Cases("main", // an ANSI console app - "wmain", // a Unicode console App - "WinMain", // an ANSI GUI app - "wWinMain", // a Unicode GUI app - "DllMain", // a DLL + .Cases({"main", // an ANSI console app + "wmain", // a Unicode console app + "WinMain", // an ANSI GUI app + "wWinMain", // a Unicode GUI app + "DllMain"}, // a DLL true) .Default(false); } @@ -3503,7 +3504,7 @@ bool FunctionDecl::isUsableAsGlobalAllocationFunctionInConstantEvaluation( while (const auto *TD = T->getAs<TypedefType>()) T = TD->getDecl()->getUnderlyingType(); const IdentifierInfo *II = - T->castAsCanonical<EnumType>()->getOriginalDecl()->getIdentifier(); + T->castAsCanonical<EnumType>()->getDecl()->getIdentifier(); if (II && II->isStr("__hot_cold_t")) Consume(); } @@ -4705,7 +4706,7 @@ bool FieldDecl::isAnonymousStructOrUnion() const { return false; if (const auto *Record = getType()->getAsCanonical<RecordType>()) - return Record->getOriginalDecl()->isAnonymousStructOrUnion(); + return Record->getDecl()->isAnonymousStructOrUnion(); return false; } @@ -4765,7 +4766,7 @@ bool FieldDecl::isZeroSize(const ASTContext &Ctx) const { const auto *RT = getType()->getAsCanonical<RecordType>(); if (!RT) return false; - const RecordDecl *RD = RT->getOriginalDecl()->getDefinition(); + const RecordDecl *RD = RT->getDecl()->getDefinition(); if (!RD) { assert(isInvalidDecl() && "valid field has incomplete type"); return false; } @@ -5190,7 +5191,7 @@ bool RecordDecl::isOrContainsUnion() const { if (const RecordDecl *Def = getDefinition()) { for (const FieldDecl *FD : Def->fields()) { const RecordType *RT = FD->getType()->getAsCanonical<RecordType>(); - if (RT && RT->getOriginalDecl()->isOrContainsUnion()) + if (RT && RT->getDecl()->isOrContainsUnion()) return true; } } @@ -5688,14 +5689,14 @@ void TypedefNameDecl::anchor() {} TagDecl *TypedefNameDecl::getAnonDeclWithTypedefName(bool AnyRedecl) const { if (auto *TT = getTypeSourceInfo()->getType()->getAs<TagType>()) { - auto *OwningTypedef = TT->getOriginalDecl()->getTypedefNameForAnonDecl(); + auto *OwningTypedef = TT->getDecl()->getTypedefNameForAnonDecl(); auto *ThisTypedef = this; if (AnyRedecl && OwningTypedef) { OwningTypedef = OwningTypedef->getCanonicalDecl(); ThisTypedef = ThisTypedef->getCanonicalDecl(); } if (OwningTypedef == ThisTypedef) - return TT->getOriginalDecl()->getDefinitionOrSelf(); + return TT->getDecl()->getDefinitionOrSelf(); } return nullptr; @@ -5704,7 +5705,7 @@ TagDecl *TypedefNameDecl::getAnonDeclWithTypedefName(bool AnyRedecl) const { bool TypedefNameDecl::isTransparentTagSlow() const { auto determineIsTransparent = [&]() { if (auto *TT = getUnderlyingType()->getAs<TagType>()) { - if (auto *TD = TT->getOriginalDecl()) { + if (auto *TD = TT->getDecl()) { if (TD->getName() != getName()) return false; SourceLocation TTLoc = getLocation(); diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index b244f0a6e6a95..30c6d3ed91f1e 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -77,8 +77,11 @@ void *Decl::operator new(std::size_t Size, const ASTContext &Context, *PrefixPtr = ID.getRawValue(); // We leave the upper 16 bits to store the module IDs. 48 bits should be - // sufficient to store a declaration ID. - assert(*PrefixPtr < llvm::maskTrailingOnes<uint64_t>(48)); + // sufficient to store a declaration ID. See the comments in setOwningModuleID + // for details.
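The packing that the comment above refers to is easiest to see with the shifts and masks written out. A minimal standalone sketch (hypothetical helper names; the real code manipulates the prefix word allocated just before the Decl):

#include <cassert>
#include <cstdint>

// Layout: [ module ID : 16 | GlobalDeclID (file index + local ID) : 48 ]
static uint64_t packPrefix(uint64_t DeclID, uint64_t ModuleID) {
  assert(DeclID < (1ULL << 48) && ModuleID < (1ULL << 16));
  return (ModuleID << 48) | DeclID;
}
static uint64_t declIDPart(uint64_t Prefix) { return Prefix & ((1ULL << 48) - 1); }
static uint64_t moduleIDPart(uint64_t Prefix) { return Prefix >> 48; }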
+ assert((*PrefixPtr < llvm::maskTrailingOnes<uint64_t>(48)) && + "The current implementation limits the number of module files to " + "at most 2^16. Contact the Clang developers to lift the limitation."); return Result; } @@ -122,6 +125,25 @@ unsigned Decl::getOwningModuleID() const { void Decl::setOwningModuleID(unsigned ID) { assert(isFromASTFile() && "Only works on a deserialized declaration"); + // Currently we use 64 bits to store both the GlobalDeclID and the module + // ID in order to save space. See `Decl::operator new` for details. To make + // this work, we split the upper 32 bits into two 16-bit fields: one for the + // module file index of the GlobalDeclID and one for the module ID. This + // limits the number of modules to 2^16. (The number of module files cannot + // exceed the number of modules.) + // + // Storing both the module file index and the module ID may look redundant, + // but it is not: for implicit template instantiations, the module ID can + // differ from the module file the declaration was deserialized from. See + // https://github.com/llvm/llvm-project/issues/101939 + // + // If we ever reach the limit, we can lift it by making every deserialized + // declaration pay for another 32 bits, or revisit the issue above and pick + // a different scheme. + assert((ID < llvm::maskTrailingOnes<uint64_t>(16)) && + "The current implementation limits the number of modules to at most " + "2^16. Contact the Clang developers to lift the limitation."); uint64_t *IDAddress = (uint64_t *)this - 1; *IDAddress &= llvm::maskTrailingOnes<uint64_t>(48); *IDAddress |= (uint64_t)ID << 48; diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 43264f835122f..24e4f189cbe4a 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -2314,7 +2314,7 @@ bool CXXRecordDecl::mayBeAbstract() const { for (const auto &B : bases()) { const auto *BaseDecl = cast<CXXRecordDecl>( - B.getType()->castAsCanonical<RecordType>()->getOriginalDecl()); + B.getType()->castAsCanonical<RecordType>()->getDecl()); if (BaseDecl->isAbstract()) return true; } diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp index 7f3dcca926cd3..47ae613b643b6 100644 --- a/clang/lib/AST/DeclPrinter.cpp +++ b/clang/lib/AST/DeclPrinter.cpp @@ -485,7 +485,7 @@ void DeclPrinter::VisitDeclContext(DeclContext *DC, bool Indent) { QualType BaseType = GetBaseType(CurDeclType); if (const auto *TT = dyn_cast_or_null<TagType>(BaseType); TT && TT->isTagOwned()) { - if (TT->getOriginalDecl() == Decls[0]) { + if (TT->getDecl() == Decls[0]) { Decls.push_back(*D); continue; } diff --git a/clang/lib/AST/DeclarationName.cpp b/clang/lib/AST/DeclarationName.cpp index 55f5a994d788b..9a89a66f3f63a 100644 --- a/clang/lib/AST/DeclarationName.cpp +++ b/clang/lib/AST/DeclarationName.cpp @@ -116,12 +116,12 @@ static void printCXXConstructorDestructorName(QualType ClassType, Policy.SuppressScope = true; if (const RecordType *ClassRec = ClassType->getAsCanonical<RecordType>()) { - ClassRec->getOriginalDecl()->printName(OS, Policy); + ClassRec->getDecl()->printName(OS, Policy); return; } if (Policy.SuppressTemplateArgsInCXXConstructors) { if (auto *InjTy = ClassType->getAsCanonical<InjectedClassNameType>()) { - InjTy->getOriginalDecl()->printName(OS, Policy); + InjTy->getDecl()->printName(OS, Policy); return; } } @@ -185,7 +185,7 @@ void DeclarationName::print(raw_ostream &OS, OS << "operator "; QualType Type = getCXXNameType(); if (const RecordType *Rec = Type->getAs<RecordType>()) { - OS << *Rec->getOriginalDecl(); + OS << *Rec->getDecl(); return; } // We
know we're printing C++ here, ensure we print 'bool' properly. diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 597cbd846e4d9..340bb4b2ed6a3 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -4126,9 +4126,7 @@ Expr::isNullPointerConstant(ASTContext &Ctx, if (const RecordType *UT = getType()->getAsUnionType()) if (!Ctx.getLangOpts().CPlusPlus11 && UT && - UT->getOriginalDecl() - ->getMostRecentDecl() - ->hasAttr()) + UT->getDecl()->getMostRecentDecl()->hasAttr()) if (const CompoundLiteralExpr *CLE = dyn_cast(this)){ const Expr *InitExpr = CLE->getInitializer(); if (const InitListExpr *ILE = dyn_cast(InitExpr)) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index dfdfef2a87bff..00aaaab957591 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -4074,8 +4074,7 @@ findSubobject(EvalInfo &Info, const Expr *E, const CompleteObject &Obj, } // Next subobject is a class, struct or union field. - RecordDecl *RD = - ObjType->castAsCanonical()->getOriginalDecl(); + RecordDecl *RD = ObjType->castAsCanonical()->getDecl(); if (RD->isUnion()) { const FieldDecl *UnionField = O->getUnionField(); if (!UnionField || @@ -7810,7 +7809,7 @@ class BufferToAPValueConverter { std::optional visit(const EnumType *Ty, CharUnits Offset) { QualType RepresentationType = - Ty->getOriginalDecl()->getDefinitionOrSelf()->getIntegerType(); + Ty->getDecl()->getDefinitionOrSelf()->getIntegerType(); assert(!RepresentationType.isNull() && "enum forward decl should be caught by Sema"); const auto *AsBuiltin = @@ -8607,7 +8606,7 @@ class ExprEvaluatorBase const FieldDecl *FD = dyn_cast(E->getMemberDecl()); if (!FD) return Error(E); assert(!FD->getType()->isReferenceType() && "prvalue reference?"); - assert(BaseTy->castAsCanonical()->getOriginalDecl() == + assert(BaseTy->castAsCanonical()->getDecl() == FD->getParent()->getCanonicalDecl() && "record / field mismatch"); @@ -8836,7 +8835,7 @@ class LValueExprEvaluatorBase const ValueDecl *MD = E->getMemberDecl(); if (const FieldDecl *FD = dyn_cast(E->getMemberDecl())) { - assert(BaseTy->castAsCanonical()->getOriginalDecl() == + assert(BaseTy->castAsCanonical()->getDecl() == FD->getParent()->getCanonicalDecl() && "record / field mismatch"); (void)BaseTy; @@ -11619,6 +11618,44 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, return true; } +static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call, + APValue &Out) { + APValue SrcVec, ControlVec; + if (!EvaluateAsRValue(Info, Call->getArg(0), SrcVec)) + return false; + if (!EvaluateAsRValue(Info, Call->getArg(1), ControlVec)) + return false; + + const auto *VT = Call->getType()->getAs(); + if (!VT) + return false; + + QualType ElemT = VT->getElementType(); + unsigned NumElts = VT->getNumElements(); + + SmallVector ResultElements; + ResultElements.reserve(NumElts); + + for (unsigned Idx = 0; Idx != NumElts; ++Idx) { + APValue CtlVal = ControlVec.getVectorElt(Idx); + APSInt CtlByte = CtlVal.getInt(); + uint8_t Ctl = static_cast(CtlByte.getZExtValue()); + + if (Ctl & 0x80) { + APValue Zero(Info.Ctx.MakeIntValue(0, ElemT)); + ResultElements.push_back(Zero); + } else { + unsigned LaneBase = (Idx / 16) * 16; + unsigned SrcOffset = Ctl & 0x0F; + unsigned SrcIdx = LaneBase + SrcOffset; + + ResultElements.push_back(SrcVec.getVectorElt(SrcIdx)); + } + } + Out = APValue(ResultElements.data(), ResultElements.size()); + return true; +} + static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, bool 
IsShufHW, APValue &Out) { APValue Vec; @@ -11774,6 +11811,73 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS); }); + case X86::BI__builtin_ia32_extract128i256: + case X86::BI__builtin_ia32_vextractf128_pd256: + case X86::BI__builtin_ia32_vextractf128_ps256: + case X86::BI__builtin_ia32_vextractf128_si256: { + APValue SourceVec, SourceImm; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) || + !EvaluateAsRValue(Info, E->getArg(1), SourceImm)) + return false; + + if (!SourceVec.isVector()) + return false; + + const auto *RetVT = E->getType()->castAs(); + unsigned RetLen = RetVT->getNumElements(); + unsigned Idx = SourceImm.getInt().getZExtValue() & 1; + + SmallVector ResultElements; + ResultElements.reserve(RetLen); + + for (unsigned I = 0; I < RetLen; I++) + ResultElements.push_back(SourceVec.getVectorElt(Idx * RetLen + I)); + + return Success(APValue(ResultElements.data(), RetLen), E); + } + + case X86::BI__builtin_ia32_extracti32x4_256_mask: + case X86::BI__builtin_ia32_extractf32x4_256_mask: + case X86::BI__builtin_ia32_extracti32x4_mask: + case X86::BI__builtin_ia32_extractf32x4_mask: + case X86::BI__builtin_ia32_extracti32x8_mask: + case X86::BI__builtin_ia32_extractf32x8_mask: + case X86::BI__builtin_ia32_extracti64x2_256_mask: + case X86::BI__builtin_ia32_extractf64x2_256_mask: + case X86::BI__builtin_ia32_extracti64x2_512_mask: + case X86::BI__builtin_ia32_extractf64x2_512_mask: + case X86::BI__builtin_ia32_extracti64x4_mask: + case X86::BI__builtin_ia32_extractf64x4_mask: { + APValue SourceVec, MergeVec; + APSInt Imm, MaskImm; + + if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) || + !EvaluateInteger(E->getArg(1), Imm, Info) || + !EvaluateAsRValue(Info, E->getArg(2), MergeVec) || + !EvaluateInteger(E->getArg(3), MaskImm, Info)) + return false; + + const auto *RetVT = E->getType()->castAs(); + unsigned RetLen = RetVT->getNumElements(); + + if (!SourceVec.isVector() || !MergeVec.isVector()) + return false; + unsigned SrcLen = SourceVec.getVectorLength(); + unsigned Lanes = SrcLen / RetLen; + unsigned Lane = static_cast(Imm.getZExtValue() % Lanes); + unsigned Base = Lane * RetLen; + + SmallVector ResultElements; + ResultElements.reserve(RetLen); + for (unsigned I = 0; I < RetLen; ++I) { + if (MaskImm[I]) + ResultElements.push_back(SourceVec.getVectorElt(Base + I)); + else + ResultElements.push_back(MergeVec.getVectorElt(I)); + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case clang::X86::BI__builtin_ia32_pavgb128: case clang::X86::BI__builtin_ia32_pavgw128: case clang::X86::BI__builtin_ia32_pavgb256: @@ -11782,6 +11886,14 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case clang::X86::BI__builtin_ia32_pavgw512: return EvaluateBinOpExpr(llvm::APIntOps::avgCeilU); + case clang::X86::BI__builtin_ia32_pmulhrsw128: + case clang::X86::BI__builtin_ia32_pmulhrsw256: + case clang::X86::BI__builtin_ia32_pmulhrsw512: + return EvaluateBinOpExpr([](const APSInt &LHS, const APSInt &RHS) { + return (llvm::APIntOps::mulsExtended(LHS, RHS).ashr(14) + 1) + .extractBits(16, 1); + }); + case clang::X86::BI__builtin_ia32_pmaddubsw128: case clang::X86::BI__builtin_ia32_pmaddubsw256: case clang::X86::BI__builtin_ia32_pmaddubsw512: @@ -11974,6 +12086,54 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + + case X86::BI__builtin_ia32_vpmadd52luq128: + case 
X86::BI__builtin_ia32_vpmadd52luq256: + case X86::BI__builtin_ia32_vpmadd52luq512: { + APValue A, B, C; + if (!EvaluateAsRValue(Info, E->getArg(0), A) || + !EvaluateAsRValue(Info, E->getArg(1), B) || + !EvaluateAsRValue(Info, E->getArg(2), C)) + return false; + + unsigned ALen = A.getVectorLength(); + SmallVector ResultElements; + ResultElements.reserve(ALen); + + for (unsigned EltNum = 0; EltNum < ALen; EltNum += 1) { + APInt AElt = A.getVectorElt(EltNum).getInt(); + APInt BElt = B.getVectorElt(EltNum).getInt().trunc(52); + APInt CElt = C.getVectorElt(EltNum).getInt().trunc(52); + APSInt ResElt(AElt + (BElt * CElt).zext(64), false); + ResultElements.push_back(APValue(ResElt)); + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case X86::BI__builtin_ia32_vpmadd52huq128: + case X86::BI__builtin_ia32_vpmadd52huq256: + case X86::BI__builtin_ia32_vpmadd52huq512: { + APValue A, B, C; + if (!EvaluateAsRValue(Info, E->getArg(0), A) || + !EvaluateAsRValue(Info, E->getArg(1), B) || + !EvaluateAsRValue(Info, E->getArg(2), C)) + return false; + + unsigned ALen = A.getVectorLength(); + SmallVector ResultElements; + ResultElements.reserve(ALen); + + for (unsigned EltNum = 0; EltNum < ALen; EltNum += 1) { + APInt AElt = A.getVectorElt(EltNum).getInt(); + APInt BElt = B.getVectorElt(EltNum).getInt().trunc(52); + APInt CElt = C.getVectorElt(EltNum).getInt().trunc(52); + APSInt ResElt(AElt + llvm::APIntOps::mulhu(BElt, CElt).zext(64), false); + ResultElements.push_back(APValue(ResElt)); + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case clang::X86::BI__builtin_ia32_vprotbi: case clang::X86::BI__builtin_ia32_vprotdi: case clang::X86::BI__builtin_ia32_vprotqi: @@ -12094,6 +12254,37 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case X86::BI__builtin_ia32_vpconflictsi_128: + case X86::BI__builtin_ia32_vpconflictsi_256: + case X86::BI__builtin_ia32_vpconflictsi_512: + case X86::BI__builtin_ia32_vpconflictdi_128: + case X86::BI__builtin_ia32_vpconflictdi_256: + case X86::BI__builtin_ia32_vpconflictdi_512: { + APValue Source; + + if (!EvaluateAsRValue(Info, E->getArg(0), Source)) + return false; + + unsigned SourceLen = Source.getVectorLength(); + SmallVector ResultElements; + ResultElements.reserve(SourceLen); + + const auto *VecT = E->getType()->castAs(); + bool DestUnsigned = + VecT->getElementType()->isUnsignedIntegerOrEnumerationType(); + + for (unsigned I = 0; I != SourceLen; ++I) { + const APValue &EltI = Source.getVectorElt(I); + + APInt ConflictMask(EltI.getInt().getBitWidth(), 0); + for (unsigned J = 0; J != I; ++J) { + const APValue &EltJ = Source.getVectorElt(J); + ConflictMask.setBitVal(J, EltI.getInt() == EltJ.getInt()); + } + ResultElements.push_back(APValue(APSInt(ConflictMask, DestUnsigned))); + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } case X86::BI__builtin_ia32_blendpd: case X86::BI__builtin_ia32_blendpd256: case X86::BI__builtin_ia32_blendps: @@ -12193,6 +12384,15 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case X86::BI__builtin_ia32_pshufb128: + case X86::BI__builtin_ia32_pshufb256: + case X86::BI__builtin_ia32_pshufb512: { + APValue R; + if (!evalPshufbBuiltin(Info, E, R)) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_pshuflw: case 
X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: { @@ -12220,6 +12420,40 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(R, E); } + case X86::BI__builtin_ia32_phminposuw128: { + APValue Source; + if (!Evaluate(Source, Info, E->getArg(0))) + return false; + unsigned SourceLen = Source.getVectorLength(); + const VectorType *VT = E->getArg(0)->getType()->castAs(); + QualType ElemQT = VT->getElementType(); + unsigned ElemBitWidth = Info.Ctx.getTypeSize(ElemQT); + + APInt MinIndex(ElemBitWidth, 0); + APInt MinVal = Source.getVectorElt(0).getInt(); + for (unsigned I = 1; I != SourceLen; ++I) { + APInt Val = Source.getVectorElt(I).getInt(); + if (MinVal.ugt(Val)) { + MinVal = Val; + MinIndex = I; + } + } + + bool ResultUnsigned = E->getCallReturnType(Info.Ctx) + ->castAs() + ->getElementType() + ->isUnsignedIntegerOrEnumerationType(); + + SmallVector Result; + Result.reserve(SourceLen); + Result.emplace_back(APSInt(MinVal, ResultUnsigned)); + Result.emplace_back(APSInt(MinIndex, ResultUnsigned)); + for (unsigned I = 0; I != SourceLen - 2; ++I) { + Result.emplace_back(APSInt(APInt(ElemBitWidth, 0), ResultUnsigned)); + } + return Success(APValue(Result.data(), Result.size()), E); + } + case X86::BI__builtin_ia32_pternlogd128_mask: case X86::BI__builtin_ia32_pternlogd256_mask: case X86::BI__builtin_ia32_pternlogd512_mask: @@ -12381,6 +12615,169 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case clang::X86::BI__builtin_ia32_phaddw128: + case clang::X86::BI__builtin_ia32_phaddw256: + case clang::X86::BI__builtin_ia32_phaddd128: + case clang::X86::BI__builtin_ia32_phaddd256: + case clang::X86::BI__builtin_ia32_phaddsw128: + case clang::X86::BI__builtin_ia32_phaddsw256: + + case clang::X86::BI__builtin_ia32_phsubw128: + case clang::X86::BI__builtin_ia32_phsubw256: + case clang::X86::BI__builtin_ia32_phsubd128: + case clang::X86::BI__builtin_ia32_phsubd256: + case clang::X86::BI__builtin_ia32_phsubsw128: + case clang::X86::BI__builtin_ia32_phsubsw256: { + APValue SourceLHS, SourceRHS; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) || + !EvaluateAsRValue(Info, E->getArg(1), SourceRHS)) + return false; + QualType DestEltTy = E->getType()->castAs()->getElementType(); + bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType(); + + unsigned NumElts = SourceLHS.getVectorLength(); + unsigned EltBits = Info.Ctx.getIntWidth(DestEltTy); + unsigned EltsPerLane = 128 / EltBits; + SmallVector ResultElements; + ResultElements.reserve(NumElts); + + for (unsigned LaneStart = 0; LaneStart != NumElts; + LaneStart += EltsPerLane) { + for (unsigned I = 0; I != EltsPerLane; I += 2) { + APSInt LHSA = SourceLHS.getVectorElt(LaneStart + I).getInt(); + APSInt LHSB = SourceLHS.getVectorElt(LaneStart + I + 1).getInt(); + switch (E->getBuiltinCallee()) { + case clang::X86::BI__builtin_ia32_phaddw128: + case clang::X86::BI__builtin_ia32_phaddw256: + case clang::X86::BI__builtin_ia32_phaddd128: + case clang::X86::BI__builtin_ia32_phaddd256: { + APSInt Res(LHSA + LHSB, DestUnsigned); + ResultElements.push_back(APValue(Res)); + break; + } + case clang::X86::BI__builtin_ia32_phaddsw128: + case clang::X86::BI__builtin_ia32_phaddsw256: { + APSInt Res(LHSA.sadd_sat(LHSB)); + ResultElements.push_back(APValue(Res)); + break; + } + case clang::X86::BI__builtin_ia32_phsubw128: + case clang::X86::BI__builtin_ia32_phsubw256: + case clang::X86::BI__builtin_ia32_phsubd128: + case 
clang::X86::BI__builtin_ia32_phsubd256: { + APSInt Res(LHSA - LHSB, DestUnsigned); + ResultElements.push_back(APValue(Res)); + break; + } + case clang::X86::BI__builtin_ia32_phsubsw128: + case clang::X86::BI__builtin_ia32_phsubsw256: { + APSInt Res(LHSA.ssub_sat(LHSB)); + ResultElements.push_back(APValue(Res)); + break; + } + } + } + for (unsigned I = 0; I != EltsPerLane; I += 2) { + APSInt RHSA = SourceRHS.getVectorElt(LaneStart + I).getInt(); + APSInt RHSB = SourceRHS.getVectorElt(LaneStart + I + 1).getInt(); + switch (E->getBuiltinCallee()) { + case clang::X86::BI__builtin_ia32_phaddw128: + case clang::X86::BI__builtin_ia32_phaddw256: + case clang::X86::BI__builtin_ia32_phaddd128: + case clang::X86::BI__builtin_ia32_phaddd256: { + APSInt Res(RHSA + RHSB, DestUnsigned); + ResultElements.push_back(APValue(Res)); + break; + } + case clang::X86::BI__builtin_ia32_phaddsw128: + case clang::X86::BI__builtin_ia32_phaddsw256: { + APSInt Res(RHSA.sadd_sat(RHSB)); + ResultElements.push_back(APValue(Res)); + break; + } + case clang::X86::BI__builtin_ia32_phsubw128: + case clang::X86::BI__builtin_ia32_phsubw256: + case clang::X86::BI__builtin_ia32_phsubd128: + case clang::X86::BI__builtin_ia32_phsubd256: { + APSInt Res(RHSA - RHSB, DestUnsigned); + ResultElements.push_back(APValue(Res)); + break; + } + case clang::X86::BI__builtin_ia32_phsubsw128: + case clang::X86::BI__builtin_ia32_phsubsw256: { + APSInt Res(RHSA.ssub_sat(RHSB)); + ResultElements.push_back(APValue(Res)); + break; + } + } + } + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case clang::X86::BI__builtin_ia32_haddpd: + case clang::X86::BI__builtin_ia32_haddps: + case clang::X86::BI__builtin_ia32_haddps256: + case clang::X86::BI__builtin_ia32_haddpd256: + case clang::X86::BI__builtin_ia32_hsubpd: + case clang::X86::BI__builtin_ia32_hsubps: + case clang::X86::BI__builtin_ia32_hsubps256: + case clang::X86::BI__builtin_ia32_hsubpd256: { + APValue SourceLHS, SourceRHS; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) || + !EvaluateAsRValue(Info, E->getArg(1), SourceRHS)) + return false; + unsigned NumElts = SourceLHS.getVectorLength(); + SmallVector ResultElements; + ResultElements.reserve(NumElts); + llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E); + QualType DestEltTy = E->getType()->castAs()->getElementType(); + unsigned EltBits = Info.Ctx.getTypeSize(DestEltTy); + unsigned NumLanes = NumElts * EltBits / 128; + unsigned NumElemsPerLane = NumElts / NumLanes; + unsigned HalfElemsPerLane = NumElemsPerLane / 2; + + for (unsigned L = 0; L != NumElts; L += NumElemsPerLane) { + for (unsigned I = 0; I != HalfElemsPerLane; ++I) { + APFloat LHSA = SourceLHS.getVectorElt(L + (2 * I) + 0).getFloat(); + APFloat LHSB = SourceLHS.getVectorElt(L + (2 * I) + 1).getFloat(); + switch (E->getBuiltinCallee()) { + case clang::X86::BI__builtin_ia32_haddpd: + case clang::X86::BI__builtin_ia32_haddps: + case clang::X86::BI__builtin_ia32_haddps256: + case clang::X86::BI__builtin_ia32_haddpd256: + LHSA.add(LHSB, RM); + break; + case clang::X86::BI__builtin_ia32_hsubpd: + case clang::X86::BI__builtin_ia32_hsubps: + case clang::X86::BI__builtin_ia32_hsubps256: + case clang::X86::BI__builtin_ia32_hsubpd256: + LHSA.subtract(LHSB, RM); + break; + } + ResultElements.push_back(APValue(LHSA)); + } + for (unsigned I = 0; I != HalfElemsPerLane; ++I) { + APFloat RHSA = SourceRHS.getVectorElt(L + (2 * I) + 0).getFloat(); + APFloat RHSB = SourceRHS.getVectorElt(L + (2 * I) + 1).getFloat(); + switch 
(E->getBuiltinCallee()) { + case clang::X86::BI__builtin_ia32_haddpd: + case clang::X86::BI__builtin_ia32_haddps: + case clang::X86::BI__builtin_ia32_haddps256: + case clang::X86::BI__builtin_ia32_haddpd256: + RHSA.add(RHSB, RM); + break; + case clang::X86::BI__builtin_ia32_hsubpd: + case clang::X86::BI__builtin_ia32_hsubps: + case clang::X86::BI__builtin_ia32_hsubps256: + case clang::X86::BI__builtin_ia32_hsubpd256: + RHSA.subtract(RHSB, RM); + break; + } + ResultElements.push_back(APValue(RHSA)); + } + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } case Builtin::BI__builtin_elementwise_fshl: case Builtin::BI__builtin_elementwise_fshr: { APValue SourceHi, SourceLo, SourceShift; @@ -14972,6 +15369,36 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, return Success(CarryOut, E); } + case clang::X86::BI__builtin_ia32_movmskps: + case clang::X86::BI__builtin_ia32_movmskpd: + case clang::X86::BI__builtin_ia32_pmovmskb128: + case clang::X86::BI__builtin_ia32_pmovmskb256: + case clang::X86::BI__builtin_ia32_movmskps256: + case clang::X86::BI__builtin_ia32_movmskpd256: { + APValue Source; + if (!Evaluate(Source, Info, E->getArg(0))) + return false; + unsigned SourceLen = Source.getVectorLength(); + const VectorType *VT = E->getArg(0)->getType()->castAs(); + QualType ElemQT = VT->getElementType(); + unsigned ResultLen = Info.Ctx.getTypeSize( + E->getCallReturnType(Info.Ctx)); // Always 32-bit integer. + APInt Result(ResultLen, 0); + + for (unsigned I = 0; I != SourceLen; ++I) { + APInt Elem; + if (ElemQT->isIntegerType()) { + Elem = Source.getVectorElt(I).getInt(); + } else if (ElemQT->isRealFloatingType()) { + Elem = Source.getVectorElt(I).getFloat().bitcastToAPInt(); + } else { + return false; + } + Result.setBitVal(I, Elem.isNegative()); + } + return Success(Result, E); + } + case clang::X86::BI__builtin_ia32_bextr_u32: case clang::X86::BI__builtin_ia32_bextr_u64: case clang::X86::BI__builtin_ia32_bextri_u32: diff --git a/clang/lib/AST/InheritViz.cpp b/clang/lib/AST/InheritViz.cpp index 3c4a5a8e2c4a6..c5f4c2bc95719 100644 --- a/clang/lib/AST/InheritViz.cpp +++ b/clang/lib/AST/InheritViz.cpp @@ -89,8 +89,8 @@ void InheritanceHierarchyWriter::WriteNode(QualType Type, bool FromVirtual) { Out << " \"];\n"; // Display the base classes. 
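Returning to the movmsk evaluation added in ExprConstant.cpp above: each result bit is simply the corresponding lane's sign bit. A scalar model of the 4 x float case (illustrative, standalone):

#include <cstdint>
#include <cstring>

static uint32_t movmskps(const float (&Lanes)[4]) {
  uint32_t Mask = 0;
  for (int I = 0; I != 4; ++I) {
    uint32_t Bits;
    std::memcpy(&Bits, &Lanes[I], sizeof(Bits)); // bitcastToAPInt() analogue
    Mask |= (Bits >> 31) << I;                   // lane sign bit -> bit I
  }
  return Mask;
}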
- const auto *Decl = cast( - Type->castAsCanonical()->getOriginalDecl()); + const auto *Decl = + cast(Type->castAsCanonical()->getDecl()); for (const auto &Base : Decl->bases()) { QualType CanonBaseType = Context.getCanonicalType(Base.getType()); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 844db79f18a4a..5572e0a7ae59c 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -2491,7 +2491,7 @@ bool CXXNameMangler::mangleUnresolvedTypeOrSimpleId(QualType Ty, case Type::Enum: case Type::Record: mangleSourceNameWithAbiTags( - cast(Ty)->getOriginalDecl()->getDefinitionOrSelf()); + cast(Ty)->getDecl()->getDefinitionOrSelf()); break; case Type::TemplateSpecialization: { @@ -2556,9 +2556,8 @@ bool CXXNameMangler::mangleUnresolvedTypeOrSimpleId(QualType Ty, } case Type::InjectedClassName: - mangleSourceNameWithAbiTags(cast(Ty) - ->getOriginalDecl() - ->getDefinitionOrSelf()); + mangleSourceNameWithAbiTags( + cast(Ty)->getDecl()->getDefinitionOrSelf()); break; case Type::DependentName: @@ -3795,7 +3794,7 @@ void CXXNameMangler::mangleType(const RecordType *T) { mangleType(static_cast(T)); } void CXXNameMangler::mangleType(const TagType *T) { - mangleName(T->getOriginalDecl()->getDefinitionOrSelf()); + mangleName(T->getDecl()->getDefinitionOrSelf()); } // ::= @@ -4430,8 +4429,8 @@ void CXXNameMangler::mangleType(const InjectedClassNameType *T) { // Mangle injected class name types as if the user had written the // specialization out fully. It may not actually be possible to see // this mangling, though. - mangleType(T->getOriginalDecl()->getCanonicalTemplateSpecializationType( - getASTContext())); + mangleType( + T->getDecl()->getCanonicalTemplateSpecializationType(getASTContext())); } void CXXNameMangler::mangleType(const TemplateSpecializationType *T) { @@ -4692,7 +4691,7 @@ void CXXNameMangler::mangleIntegerLiteral(QualType T, void CXXNameMangler::mangleMemberExprBase(const Expr *Base, bool IsArrow) { // Ignore member expressions involving anonymous unions. 
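A concrete case for the loop below (illustrative source, not from the patch): the anonymous-union layer is walked past so that it never appears in the mangled member expression.

struct A {
  union { int x; };  // anonymous union member
};
// In a member expression a.x, the base used for mangling is 'a' itself;
// the unnamed union object in between is skipped by the loop below.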
while (const auto *RT = Base->getType()->getAsCanonical()) { - if (!RT->getOriginalDecl()->isAnonymousStructOrUnion()) + if (!RT->getDecl()->isAnonymousStructOrUnion()) break; const auto *ME = dyn_cast(Base); if (!ME) @@ -7010,8 +7009,7 @@ bool CXXNameMangler::isSpecializedAs(QualType S, llvm::StringRef Name, if (!RT) return false; - const ClassTemplateSpecializationDecl *SD = - dyn_cast(RT->getOriginalDecl()); + const auto *SD = dyn_cast(RT->getDecl()); if (!SD || !SD->getIdentifier()->isStr(Name)) return false; diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index 0ef632805d67c..9f4dba9f14fa6 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -396,7 +396,7 @@ llvm::json::Array JSONNodeDumper::createCastPath(const CastExpr *C) { for (auto I = C->path_begin(), E = C->path_end(); I != E; ++I) { const CXXBaseSpecifier *Base = *I; const auto *RD = cast( - Base->getType()->castAsCanonical()->getOriginalDecl()); + Base->getType()->castAsCanonical()->getDecl()); llvm::json::Object Val{{"name", RD->getName()}}; if (Base->isVirtual()) @@ -764,7 +764,7 @@ void JSONNodeDumper::VisitTagType(const TagType *TT) { Qualifier.print(OS, PrintPolicy, /*ResolveTemplateArguments=*/true); JOS.attribute("qualifier", Str); } - JOS.attribute("decl", createBareDeclRef(TT->getOriginalDecl())); + JOS.attribute("decl", createBareDeclRef(TT->getDecl())); if (TT->isTagOwned()) JOS.attribute("isTagOwned", true); } @@ -816,7 +816,7 @@ void JSONNodeDumper::VisitTemplateSpecializationType( void JSONNodeDumper::VisitInjectedClassNameType( const InjectedClassNameType *ICNT) { - JOS.attribute("decl", createBareDeclRef(ICNT->getOriginalDecl())); + JOS.attribute("decl", createBareDeclRef(ICNT->getDecl())); } void JSONNodeDumper::VisitObjCInterfaceType(const ObjCInterfaceType *OIT) { diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 8cbc72b1db735..f1baf9f49384b 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -3248,11 +3248,11 @@ void MicrosoftCXXNameMangler::mangleTagTypeKind(TagTypeKind TTK) { } void MicrosoftCXXNameMangler::mangleType(const EnumType *T, Qualifiers, SourceRange) { - mangleType(cast(T)->getOriginalDecl()); + mangleType(cast(T)->getDecl()); } void MicrosoftCXXNameMangler::mangleType(const RecordType *T, Qualifiers, SourceRange) { - mangleType(cast(T)->getOriginalDecl()); + mangleType(cast(T)->getDecl()); } void MicrosoftCXXNameMangler::mangleType(const TagDecl *TD) { // MSVC chooses the tag kind of the definition if it exists, otherwise it diff --git a/clang/lib/AST/ODRHash.cpp b/clang/lib/AST/ODRHash.cpp index 6842038b7eb57..46a4e256ea3e5 100644 --- a/clang/lib/AST/ODRHash.cpp +++ b/clang/lib/AST/ODRHash.cpp @@ -913,7 +913,7 @@ class ODRTypeVisitor : public TypeVisitor { return false; if (TypedefT->getDecl()->getIdentifier() != - TagT->getOriginalDecl()->getIdentifier()) + TagT->getDecl()->getIdentifier()) return false; ID.AddInteger(TagT->getTypeClass()); @@ -1059,7 +1059,7 @@ class ODRTypeVisitor : public TypeVisitor { } void VisitInjectedClassNameType(const InjectedClassNameType *T) { - AddDecl(T->getOriginalDecl()->getDefinitionOrSelf()); + AddDecl(T->getDecl()->getDefinitionOrSelf()); VisitType(T); } @@ -1164,7 +1164,7 @@ class ODRTypeVisitor : public TypeVisitor { AddNestedNameSpecifier(ElaboratedOverride ? 
ElaboratedOverride->getQualifier() : T->getQualifier()); - AddDecl(T->getOriginalDecl()->getDefinitionOrSelf()); + AddDecl(T->getDecl()->getDefinitionOrSelf()); VisitType(T); } diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp index 17c6bece44c82..142c9329141a9 100644 --- a/clang/lib/AST/OpenACCClause.cpp +++ b/clang/lib/AST/OpenACCClause.cpp @@ -509,7 +509,7 @@ OpenACCReductionClause *OpenACCReductionClause::Create( ArrayRef Recipes, SourceLocation EndLoc) { size_t NumCombiners = llvm::accumulate( - Recipes, 0, [](size_t Num, const OpenACCReductionRecipe &R) { + Recipes, 0, [](size_t Num, const OpenACCReductionRecipeWithStorage &R) { return Num + R.CombinerRecipes.size(); }); diff --git a/clang/lib/AST/ParentMapContext.cpp b/clang/lib/AST/ParentMapContext.cpp index acc011cb2faa4..7138dffb46e19 100644 --- a/clang/lib/AST/ParentMapContext.cpp +++ b/clang/lib/AST/ParentMapContext.cpp @@ -20,36 +20,6 @@ using namespace clang; -ParentMapContext::ParentMapContext(ASTContext &Ctx) : ASTCtx(Ctx) {} - -ParentMapContext::~ParentMapContext() = default; - -void ParentMapContext::clear() { Parents.reset(); } - -const Expr *ParentMapContext::traverseIgnored(const Expr *E) const { - return traverseIgnored(const_cast(E)); -} - -Expr *ParentMapContext::traverseIgnored(Expr *E) const { - if (!E) - return nullptr; - - switch (Traversal) { - case TK_AsIs: - return E; - case TK_IgnoreUnlessSpelledInSource: - return E->IgnoreUnlessSpelledInSource(); - } - llvm_unreachable("Invalid Traversal type!"); -} - -DynTypedNode ParentMapContext::traverseIgnored(const DynTypedNode &N) const { - if (const auto *E = N.get()) { - return DynTypedNode::create(*traverseIgnored(E)); - } - return N; -} - template static std::tuple matchParents(const DynTypedNodeList &NodeList, @@ -334,6 +304,36 @@ matchParents(const DynTypedNodeList &NodeList, return MatchParents::match(NodeList, ParentMap); } +ParentMapContext::ParentMapContext(ASTContext &Ctx) : ASTCtx(Ctx) {} + +ParentMapContext::~ParentMapContext() = default; + +void ParentMapContext::clear() { Parents.reset(); } + +const Expr *ParentMapContext::traverseIgnored(const Expr *E) const { + return traverseIgnored(const_cast(E)); +} + +Expr *ParentMapContext::traverseIgnored(Expr *E) const { + if (!E) + return nullptr; + + switch (Traversal) { + case TK_AsIs: + return E; + case TK_IgnoreUnlessSpelledInSource: + return E->IgnoreUnlessSpelledInSource(); + } + llvm_unreachable("Invalid Traversal type!"); +} + +DynTypedNode ParentMapContext::traverseIgnored(const DynTypedNode &N) const { + if (const auto *E = N.get()) { + return DynTypedNode::create(*traverseIgnored(E)); + } + return N; +} + /// Template specializations to abstract away from pointers and TypeLocs. /// @{ template static DynTypedNode createDynTypedNode(const T &Node) { diff --git a/clang/lib/AST/QualTypeNames.cpp b/clang/lib/AST/QualTypeNames.cpp index a2f930911bfe5..191841649a86f 100644 --- a/clang/lib/AST/QualTypeNames.cpp +++ b/clang/lib/AST/QualTypeNames.cpp @@ -125,7 +125,7 @@ static const Type *getFullyQualifiedTemplateType(const ASTContext &Ctx, // which can point to a template instantiation with no sugar in any of // its template argument, however we still need to fully qualify them. 
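An illustrative example of the situation the comment above describes (hypothetical types): a canonical instantiation can mention types with no qualification sugar, yet the printed name must still be fully qualified.

#include <vector>
namespace A {
struct X {};
using V = std::vector<X>;  // the desugared argument is just 'X'...
}
// ...but the fully qualified name must print as std::vector<A::X>.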
- const auto *TD = TSTRecord->getOriginalDecl(); + const auto *TD = TSTRecord->getDecl(); const auto *TSTDecl = dyn_cast(TD); if (!TSTDecl) return Ctx.getTagType(Keyword, Qualifier, TD, /*OwnsTag=*/false) @@ -232,7 +232,7 @@ static NestedNameSpecifier getFullyQualifiedNestedNameSpecifier( // Find decl context. const TypeDecl *TD; if (const TagType *TagDeclType = Type->getAs()) - TD = TagDeclType->getOriginalDecl(); + TD = TagDeclType->getDecl(); else if (const auto *D = dyn_cast(Type)) TD = D->getDecl(); else @@ -316,7 +316,7 @@ createNestedNameSpecifierForScopeOf(const ASTContext &Ctx, const Type *TypePtr, if (const auto *TDT = dyn_cast(TypePtr)) { Decl = TDT->getDecl(); } else if (const auto *TagDeclType = dyn_cast(TypePtr)) { - Decl = TagDeclType->getOriginalDecl(); + Decl = TagDeclType->getDecl(); } else if (const auto *TST = dyn_cast(TypePtr)) { Decl = TST->getTemplateName().getAsTemplateDecl(); } else { diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp index 00b938bdf308d..ac18d4da22e8c 100644 --- a/clang/lib/AST/RecordLayoutBuilder.cpp +++ b/clang/lib/AST/RecordLayoutBuilder.cpp @@ -2009,7 +2009,7 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, } else if (const BuiltinType *BTy = BaseTy->getAs()) { performBuiltinTypeAlignmentUpgrade(BTy); } else if (const RecordType *RT = BaseTy->getAsCanonical()) { - const RecordDecl *RD = RT->getOriginalDecl(); + const RecordDecl *RD = RT->getDecl(); const ASTRecordLayout &FieldRecord = Context.getASTRecordLayout(RD); PreferredAlign = FieldRecord.getPreferredAlignment(); } @@ -2710,7 +2710,7 @@ MicrosoftRecordLayoutBuilder::getAdjustedElementInfo( if (const auto *RT = FD->getType() ->getBaseElementTypeUnsafe() ->getAsCanonical()) { - auto const &Layout = Context.getASTRecordLayout(RT->getOriginalDecl()); + auto const &Layout = Context.getASTRecordLayout(RT->getDecl()); EndsWithZeroSizedObject = Layout.endsWithZeroSizedObject(); FieldRequiredAlignment = std::max(FieldRequiredAlignment, Layout.getRequiredAlignment()); diff --git a/clang/lib/AST/Stmt.cpp b/clang/lib/AST/Stmt.cpp index 9ae8aea3ab37a..11ece494490de 100644 --- a/clang/lib/AST/Stmt.cpp +++ b/clang/lib/AST/Stmt.cpp @@ -252,7 +252,7 @@ namespace { template good implements_children(children_t T::*) { return good(); } - LLVM_ATTRIBUTE_UNUSED + [[maybe_unused]] static bad implements_children(children_t Stmt::*) { return bad(); } @@ -261,15 +261,19 @@ namespace { template good implements_getBeginLoc(getBeginLoc_t T::*) { return good(); } - LLVM_ATTRIBUTE_UNUSED - static bad implements_getBeginLoc(getBeginLoc_t Stmt::*) { return bad(); } + [[maybe_unused]] + static bad implements_getBeginLoc(getBeginLoc_t Stmt::*) { + return bad(); + } typedef SourceLocation getLocEnd_t() const; template good implements_getEndLoc(getLocEnd_t T::*) { return good(); } - LLVM_ATTRIBUTE_UNUSED - static bad implements_getEndLoc(getLocEnd_t Stmt::*) { return bad(); } + [[maybe_unused]] + static bad implements_getEndLoc(getLocEnd_t Stmt::*) { + return bad(); + } #define ASSERT_IMPLEMENTS_children(type) \ (void) is_good(implements_children(&type::children)) @@ -282,7 +286,7 @@ namespace { /// Check whether the various Stmt classes implement their member /// functions. 
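The good/bad overload trick above, reduced to its essentials (self-contained sketch): the template overload wins only when the subclass declares the member itself; otherwise the member pointer has Base type and the plain overload is picked.

#include <type_traits>

struct good {};
struct bad {};
struct Base { void children(); };
struct Derived : Base { void children(); };  // overrides the member

template <class T> good implements(void (T::*)()) { return good(); }
[[maybe_unused]] static bad implements(void (Base::*)()) { return bad(); }

// &Derived::children has type void (Derived::*)(), so the template wins.
static_assert(std::is_same_v<decltype(implements(&Derived::children)), good>);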
-LLVM_ATTRIBUTE_UNUSED +[[maybe_unused]] static inline void check_implementations() { #define ABSTRACT_STMT(type) #define STMT(type, base) \ diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 586c3000f105c..ff8ca01ec5477 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -151,11 +151,11 @@ namespace { else StmtVisitor::Visit(S); } - void VisitStmt(Stmt *Node) LLVM_ATTRIBUTE_UNUSED { + [[maybe_unused]] void VisitStmt(Stmt *Node) { Indent() << "<>" << NL; } - void VisitExpr(Expr *Node) LLVM_ATTRIBUTE_UNUSED { + [[maybe_unused]] void VisitExpr(Expr *Node) { OS << "<>"; } diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp index 76f96fb8c5dcc..131ae6e8a478f 100644 --- a/clang/lib/AST/TemplateBase.cpp +++ b/clang/lib/AST/TemplateBase.cpp @@ -340,13 +340,14 @@ bool TemplateArgument::isPackExpansion() const { } bool TemplateArgument::isConceptOrConceptTemplateParameter() const { - if (getKind() == TemplateArgument::Template) { - if (isa(getAsTemplate().getAsTemplateDecl())) - return true; - else if (auto *TTP = dyn_cast_if_present( - getAsTemplate().getAsTemplateDecl())) - return TTP->templateParameterKind() == TNK_Concept_template; - } + if (getKind() != TemplateArgument::Template) + return false; + + if (isa_and_nonnull(getAsTemplate().getAsTemplateDecl())) + return true; + if (auto *TTP = llvm::dyn_cast_or_null( + getAsTemplate().getAsTemplateDecl())) + return TTP->templateParameterKind() == TNK_Concept_template; return false; } diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index cf5e9147ad78b..41aebdb8d2f1b 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1401,7 +1401,7 @@ static void dumpBasePath(raw_ostream &OS, const CastExpr *Node) { OS << " -> "; const auto *RD = cast( - Base->getType()->castAsCanonical()->getOriginalDecl()); + Base->getType()->castAsCanonical()->getDecl()); if (Base->isVirtual()) OS << "virtual "; @@ -2180,7 +2180,7 @@ void TextNodeDumper::VisitTagType(const TagType *T) { K != ElaboratedTypeKeyword::None) OS << ' ' << TypeWithKeyword::getKeywordName(K); dumpNestedNameSpecifier(T->getQualifier()); - dumpDeclRef(T->getOriginalDecl()); + dumpDeclRef(T->getDecl()); } void TextNodeDumper::VisitTemplateTypeParmType(const TemplateTypeParmType *T) { @@ -2232,7 +2232,7 @@ void TextNodeDumper::VisitTemplateSpecializationType( void TextNodeDumper::VisitInjectedClassNameType( const InjectedClassNameType *T) { - dumpDeclRef(T->getOriginalDecl()); + dumpDeclRef(T->getDecl()); } void TextNodeDumper::VisitObjCInterfaceType(const ObjCInterfaceType *T) { diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index ee7a68ee8ba7e..4548af17e37f2 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -114,7 +114,7 @@ const IdentifierInfo *QualType::getBaseTypeIdentifier() const { if (ty->isPointerOrReferenceType()) return ty->getPointeeType().getBaseTypeIdentifier(); if (const auto *TT = ty->getAs()) - ND = TT->getOriginalDecl(); + ND = TT->getDecl(); else if (ty->getTypeClass() == Type::Typedef) ND = ty->castAs()->getDecl(); else if (ty->isArrayType()) @@ -671,13 +671,13 @@ const Type *Type::getUnqualifiedDesugaredType() const { bool Type::isClassType() const { if (const auto *RT = getAsCanonical()) - return RT->getOriginalDecl()->isClass(); + return RT->getDecl()->isClass(); return false; } bool Type::isStructureType() const { if (const auto *RT = getAsCanonical()) - return RT->getOriginalDecl()->isStruct(); + 
return RT->getDecl()->isStruct(); return false; } @@ -685,7 +685,7 @@ bool Type::isStructureTypeWithFlexibleArrayMember() const { const auto *RT = getAsCanonical(); if (!RT) return false; - const auto *Decl = RT->getOriginalDecl(); + const auto *Decl = RT->getDecl(); if (!Decl->isStruct()) return false; return Decl->getDefinitionOrSelf()->hasFlexibleArrayMember(); @@ -699,13 +699,13 @@ bool Type::isObjCBoxableRecordType() const { bool Type::isInterfaceType() const { if (const auto *RT = getAsCanonical()) - return RT->getOriginalDecl()->isInterface(); + return RT->getDecl()->isInterface(); return false; } bool Type::isStructureOrClassType() const { if (const auto *RT = getAsCanonical()) - return RT->getOriginalDecl()->isStructureOrClass(); + return RT->getDecl()->isStructureOrClass(); return false; } @@ -717,7 +717,7 @@ bool Type::isVoidPointerType() const { bool Type::isUnionType() const { if (const auto *RT = getAsCanonical()) - return RT->getOriginalDecl()->isUnion(); + return RT->getDecl()->isUnion(); return false; } @@ -734,7 +734,7 @@ bool Type::isComplexIntegerType() const { bool Type::isScopedEnumeralType() const { if (const auto *ET = getAsCanonical()) - return ET->getOriginalDecl()->isScoped(); + return ET->getDecl()->isScoped(); return false; } @@ -768,13 +768,13 @@ QualType Type::getPointeeType() const { const RecordType *Type::getAsStructureType() const { // If this is directly a structure type, return it. if (const auto *RT = dyn_cast(this)) { - if (RT->getOriginalDecl()->isStruct()) + if (RT->getDecl()->isStruct()) return RT; } // If the canonical form of this type isn't the right kind, reject it. if (const auto *RT = dyn_cast(CanonicalType)) { - if (!RT->getOriginalDecl()->isStruct()) + if (!RT->getDecl()->isStruct()) return nullptr; // If this is a typedef for a structure type, strip the typedef off without @@ -787,13 +787,13 @@ const RecordType *Type::getAsStructureType() const { const RecordType *Type::getAsUnionType() const { // If this is directly a union type, return it. if (const auto *RT = dyn_cast(this)) { - if (RT->getOriginalDecl()->isUnion()) + if (RT->getDecl()->isUnion()) return RT; } // If the canonical form of this type isn't the right kind, reject it. if (const auto *RT = dyn_cast(CanonicalType)) { - if (!RT->getOriginalDecl()->isUnion()) + if (!RT->getDecl()->isUnion()) return nullptr; // If this is a typedef for a union type, strip the typedef off without @@ -2107,7 +2107,7 @@ bool Type::isIntegralType(const ASTContext &Ctx) const { // Complete enum types are integral in C. if (!Ctx.getLangOpts().CPlusPlus) if (const auto *ET = dyn_cast(CanonicalType)) - return IsEnumDeclComplete(ET->getOriginalDecl()); + return IsEnumDeclComplete(ET->getDecl()); return isBitIntType(); } @@ -2124,7 +2124,7 @@ bool Type::isIntegralOrUnscopedEnumerationType() const { bool Type::isUnscopedEnumerationType() const { if (const auto *ET = dyn_cast(CanonicalType)) - return !ET->getOriginalDecl()->isScoped(); + return !ET->getDecl()->isScoped(); return false; } @@ -2328,7 +2328,7 @@ bool Type::isRealType() const { return BT->getKind() >= BuiltinType::Bool && BT->getKind() <= BuiltinType::Ibm128; if (const auto *ET = dyn_cast(CanonicalType)) { - const auto *ED = ET->getOriginalDecl(); + const auto *ED = ET->getDecl(); return !ED->isScoped() && ED->getDefinitionOrSelf()->isComplete(); } return isBitIntType(); @@ -2345,7 +2345,7 @@ bool Type::isArithmeticType() const { // C++0x: Enumerations are not arithmetic types. 
For now, just return // false for scoped enumerations since that will disable any // unwanted implicit conversions. - const auto *ED = ET->getOriginalDecl(); + const auto *ED = ET->getDecl(); return !ED->isScoped() && ED->getDefinitionOrSelf()->isComplete(); } return isa(CanonicalType) || isBitIntType(); @@ -2410,8 +2410,7 @@ Type::ScalarTypeKind Type::getScalarTypeKind() const { /// includes union types. bool Type::isAggregateType() const { if (const auto *Record = dyn_cast(CanonicalType)) { - if (const auto *ClassDecl = - dyn_cast(Record->getOriginalDecl())) + if (const auto *ClassDecl = dyn_cast(Record->getDecl())) return ClassDecl->isAggregate(); return true; @@ -2746,8 +2745,8 @@ bool QualType::isCXX98PODType(const ASTContext &Context) const { return true; case Type::Record: - if (const auto *ClassDecl = dyn_cast( - cast(CanonicalType)->getOriginalDecl())) + if (const auto *ClassDecl = + dyn_cast(cast(CanonicalType)->getDecl())) return ClassDecl->isPOD(); // C struct/union is POD. @@ -3179,7 +3178,7 @@ bool Type::isNothrowT() const { bool Type::isAlignValT() const { if (const auto *ET = getAsCanonical()) { - const auto *ED = ET->getOriginalDecl(); + const auto *ED = ET->getDecl(); IdentifierInfo *II = ED->getIdentifier(); if (II && II->isStr("align_val_t") && ED->isInStdNamespace()) return true; @@ -3189,7 +3188,7 @@ bool Type::isAlignValT() const { bool Type::isStdByteType() const { if (const auto *ET = getAsCanonical()) { - const auto *ED = ET->getOriginalDecl(); + const auto *ED = ET->getDecl(); IdentifierInfo *II = ED->getIdentifier(); if (II && II->isStr("byte") && ED->isInStdNamespace()) return true; @@ -4321,7 +4320,7 @@ bool RecordType::hasConstFields() const { while (RecordTypeList.size() > NextToCheckIndex) { for (FieldDecl *FD : RecordTypeList[NextToCheckIndex] - ->getOriginalDecl() + ->getDecl() ->getDefinitionOrSelf() ->fields()) { QualType FieldTy = FD->getType(); @@ -4815,8 +4814,7 @@ static CachedProperties computeCachedProperties(const Type *T) { case Type::Record: case Type::Enum: { - const TagDecl *Tag = - cast(T)->getOriginalDecl()->getDefinitionOrSelf(); + const auto *Tag = cast(T)->getDecl()->getDefinitionOrSelf(); // C++ [basic.link]p8: // - it is a class or enumeration type that is named (or has a name @@ -4926,7 +4924,7 @@ LinkageInfo LinkageComputer::computeTypeLinkageInfo(const Type *T) { case Type::Record: case Type::Enum: return getDeclLinkageAndVisibility( - cast(T)->getOriginalDecl()->getDefinitionOrSelf()); + cast(T)->getDecl()->getDefinitionOrSelf()); case Type::Complex: return computeTypeLinkageInfo(cast(T)->getElementType()); @@ -5129,7 +5127,7 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const { llvm_unreachable("unknown builtin type"); case Type::Record: { - const RecordDecl *RD = cast(type)->getOriginalDecl(); + const auto *RD = cast(type)->getDecl(); // For template specializations, look only at primary template attributes. // This is a consistent regardless of whether the instantiation is known. if (const auto *CTSD = dyn_cast(RD)) @@ -5327,7 +5325,7 @@ bool Type::isCARCBridgableType() const { /// Check if the specified type is the CUDA device builtin surface type. bool Type::isCUDADeviceBuiltinSurfaceType() const { if (const auto *RT = getAsCanonical()) - return RT->getOriginalDecl() + return RT->getDecl() ->getMostRecentDecl() ->hasAttr(); return false; @@ -5336,7 +5334,7 @@ bool Type::isCUDADeviceBuiltinSurfaceType() const { /// Check if the specified type is the CUDA device builtin texture type. 
bool Type::isCUDADeviceBuiltinTextureType() const { if (const auto *RT = getAsCanonical()) - return RT->getOriginalDecl() + return RT->getDecl() ->getMostRecentDecl() ->hasAttr(); return false; diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp index e952e82031976..f54ccf0932bc7 100644 --- a/clang/lib/AST/TypeLoc.cpp +++ b/clang/lib/AST/TypeLoc.cpp @@ -303,8 +303,7 @@ bool TypeSpecTypeLoc::isKind(const TypeLoc &TL) { } bool TagTypeLoc::isDefinition() const { - return getTypePtr()->isTagOwned() && - getOriginalDecl()->isCompleteDefinition(); + return getTypePtr()->isTagOwned() && getDecl()->isCompleteDefinition(); } // Reimplemented to account for GNU/C++ extension diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 66a1b684ec68b..2da7789fe8117 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -1421,7 +1421,7 @@ void TypePrinter::printDeducedTemplateSpecializationBefore( } else { // Should only get here for canonical types. const auto *CD = cast( - cast(T->getDeducedType())->getOriginalDecl()); + cast(T->getDeducedType())->getDecl()); DeducedTD = CD->getSpecializedTemplate(); Args = CD->getTemplateArgs().asArray(); } @@ -1565,7 +1565,7 @@ void TypePrinter::AppendScope(DeclContext *DC, raw_ostream &OS, } void TypePrinter::printTagType(const TagType *T, raw_ostream &OS) { - TagDecl *D = T->getOriginalDecl(); + TagDecl *D = T->getDecl(); if (Policy.IncludeTagDefinition && T->isTagOwned()) { D->print(OS, Policy, Indentation); @@ -1669,11 +1669,11 @@ void TypePrinter::printTagType(const TagType *T, raw_ostream &OS) { void TypePrinter::printRecordBefore(const RecordType *T, raw_ostream &OS) { // Print the preferred name if we have one for this type. if (Policy.UsePreferredNames) { - for (const auto *PNA : T->getOriginalDecl() + for (const auto *PNA : T->getDecl() ->getMostRecentDecl() ->specific_attrs()) { if (!declaresSameEntity(PNA->getTypedefType()->getAsCXXRecordDecl(), - T->getOriginalDecl())) + T->getDecl())) continue; // Find the outermost typedef or alias template. QualType T = PNA->getTypedefType(); @@ -1700,11 +1700,11 @@ void TypePrinter::printEnumAfter(const EnumType *T, raw_ostream &OS) {} void TypePrinter::printInjectedClassNameBefore(const InjectedClassNameType *T, raw_ostream &OS) { - const ASTContext &Ctx = T->getOriginalDecl()->getASTContext(); + const ASTContext &Ctx = T->getDecl()->getASTContext(); IncludeStrongLifetimeRAII Strong(Policy); T->getTemplateName(Ctx).print(OS, Policy); if (Policy.PrintInjectedClassNameWithArguments) { - auto *Decl = T->getOriginalDecl(); + auto *Decl = T->getDecl(); // FIXME: Use T->getTemplateArgs(Ctx) when that supports as-written // arguments. 
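What PrintInjectedClassNameWithArguments controls, at the source level (illustrative): inside a class template's own definition, the injected class name is shorthand for the current specialization, and the policy decides whether the printer spells the arguments back out.

template <typename T, int N>
struct Vec {
  Vec *self();  // 'Vec' is the injected class name, i.e. Vec<T, N>
};
// With the policy enabled, the return type above prints as 'Vec<T, N> *'.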
if (auto *RD = dyn_cast(Decl)) { diff --git a/clang/lib/AST/VTableBuilder.cpp b/clang/lib/AST/VTableBuilder.cpp index 6cec526ba8443..3ded3a51206da 100644 --- a/clang/lib/AST/VTableBuilder.cpp +++ b/clang/lib/AST/VTableBuilder.cpp @@ -312,13 +312,12 @@ ComputeReturnAdjustmentBaseOffset(ASTContext &Context, return BaseOffset(); } - const CXXRecordDecl *DerivedRD = - cast( - cast(CanDerivedReturnType)->getOriginalDecl()) + const auto *DerivedRD = + cast(cast(CanDerivedReturnType)->getDecl()) ->getDefinitionOrSelf(); - const CXXRecordDecl *BaseRD = cast( - cast(CanBaseReturnType)->getOriginalDecl()); + const auto *BaseRD = + cast(cast(CanBaseReturnType)->getDecl()); return ComputeBaseOffset(Context, BaseRD, DerivedRD); } diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp index 1e376da1be83d..75b17c545bb78 100644 --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -140,7 +140,8 @@ class ExprPointeeResolve { // explicit cast will be checked in `findPointeeToNonConst` const CastKind kind = ICE->getCastKind(); if (kind == CK_LValueToRValue || kind == CK_DerivedToBase || - kind == CK_UncheckedDerivedToBase) + kind == CK_UncheckedDerivedToBase || + (kind == CK_NoOp && (ICE->getType() == ICE->getSubExpr()->getType()))) return resolveExpr(ICE->getSubExpr()); return false; } @@ -788,13 +789,16 @@ ExprMutationAnalyzer::Analyzer::findPointeeToNonConst(const Expr *Exp) { // FIXME: false positive if the pointee does not change in lambda const auto CaptureNoConst = lambdaExpr(hasCaptureInit(Exp)); - const auto Matches = - match(stmt(anyOf(forEachDescendant( - stmt(anyOf(AssignToNonConst, PassAsNonConstArg, - CastToNonConst, CaptureNoConst)) - .bind("stmt")), - forEachDescendant(InitToNonConst))), - Stm, Context); + const auto ReturnNoConst = + returnStmt(hasReturnValue(canResolveToExprPointee(Exp))); + + const auto Matches = match( + stmt(anyOf(forEachDescendant( + stmt(anyOf(AssignToNonConst, PassAsNonConstArg, + CastToNonConst, CaptureNoConst, ReturnNoConst)) + .bind("stmt")), + forEachDescendant(InitToNonConst))), + Stm, Context); return selectFirst("stmt", Matches); } diff --git a/clang/lib/Analysis/FlowSensitive/Models/CMakeLists.txt b/clang/lib/Analysis/FlowSensitive/Models/CMakeLists.txt index 89bbe8791eb2c..d1236f5714881 100644 --- a/clang/lib/Analysis/FlowSensitive/Models/CMakeLists.txt +++ b/clang/lib/Analysis/FlowSensitive/Models/CMakeLists.txt @@ -1,6 +1,7 @@ add_clang_library(clangAnalysisFlowSensitiveModels ChromiumCheckModel.cpp UncheckedOptionalAccessModel.cpp + UncheckedStatusOrAccessModel.cpp LINK_LIBS clangAnalysis diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp index bb703eff4baff..0fa333eedcfdd 100644 --- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp +++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp @@ -241,9 +241,9 @@ auto nulloptTypeDecl() { auto hasNulloptType() { return hasType(nulloptTypeDecl()); } auto inPlaceClass() { - return recordDecl(hasAnyName("std::in_place_t", "absl::in_place_t", - "base::in_place_t", "folly::in_place_t", - "bsl::in_place_t")); + return namedDecl(hasAnyName("std::in_place_t", "absl::in_place_t", + "base::in_place_t", "folly::in_place_t", + "bsl::in_place_t")); } auto isOptionalNulloptConstructor() { diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp 
b/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp new file mode 100644 index 0000000000000..c88a4702cae07 --- /dev/null +++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp @@ -0,0 +1,290 @@ +//===- UncheckedStatusOrAccessModel.cpp -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.h" + +#include +#include + +#include "clang/AST/DeclCXX.h" +#include "clang/AST/DeclTemplate.h" +#include "clang/AST/Expr.h" +#include "clang/AST/ExprCXX.h" +#include "clang/AST/TypeBase.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" +#include "clang/ASTMatchers/ASTMatchersInternal.h" +#include "clang/Analysis/CFG.h" +#include "clang/Analysis/FlowSensitive/CFGMatchSwitch.h" +#include "clang/Analysis/FlowSensitive/DataflowAnalysis.h" +#include "clang/Analysis/FlowSensitive/DataflowEnvironment.h" +#include "clang/Analysis/FlowSensitive/MatchSwitch.h" +#include "clang/Analysis/FlowSensitive/StorageLocation.h" +#include "clang/Analysis/FlowSensitive/Value.h" +#include "clang/Basic/LLVM.h" +#include "clang/Basic/SourceLocation.h" +#include "llvm/ADT/StringMap.h" + +namespace clang::dataflow::statusor_model { +namespace { + +using ::clang::ast_matchers::MatchFinder; +using ::clang::ast_matchers::StatementMatcher; + +} // namespace + +static bool namespaceEquals(const NamespaceDecl *NS, + clang::ArrayRef NamespaceNames) { + while (!NamespaceNames.empty() && NS) { + if (NS->getName() != NamespaceNames.consume_back()) + return false; + NS = dyn_cast_or_null(NS->getParent()); + } + return NamespaceNames.empty() && !NS; +} + +// TODO: move this to a proper place to share with the rest of clang +static bool isTypeNamed(QualType Type, clang::ArrayRef NS, + StringRef Name) { + if (Type.isNull()) + return false; + if (auto *RD = Type->getAsRecordDecl()) + if (RD->getName() == Name) + if (const auto *N = dyn_cast_or_null(RD->getDeclContext())) + return namespaceEquals(N, NS); + return false; +} + +static bool isStatusOrOperatorBaseType(QualType Type) { + return isTypeNamed(Type, {"absl", "internal_statusor"}, "OperatorBase"); +} + +static bool isSafeUnwrap(RecordStorageLocation *StatusOrLoc, + const Environment &Env) { + if (!StatusOrLoc) + return false; + auto &StatusLoc = locForStatus(*StatusOrLoc); + auto *OkVal = Env.get(locForOk(StatusLoc)); + return OkVal != nullptr && Env.proves(OkVal->formula()); +} + +static ClassTemplateSpecializationDecl * +getStatusOrBaseClass(const QualType &Ty) { + auto *RD = Ty->getAsCXXRecordDecl(); + if (RD == nullptr) + return nullptr; + if (isStatusOrType(Ty) || + // In case we are analyzing code under OperatorBase itself that uses + // operator* (e.g. to implement operator->). 
+ isStatusOrOperatorBaseType(Ty)) + return cast(RD); + if (!RD->hasDefinition()) + return nullptr; + for (const auto &Base : RD->bases()) + if (auto *QT = getStatusOrBaseClass(Base.getType())) + return QT; + return nullptr; +} + +static QualType getStatusOrValueType(ClassTemplateSpecializationDecl *TRD) { + return TRD->getTemplateArgs().get(0).getAsType(); +} + +static auto isStatusOrMemberCallWithName(llvm::StringRef member_name) { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return cxxMemberCallExpr( + on(expr(unless(cxxThisExpr()))), + callee(cxxMethodDecl( + hasName(member_name), + ofClass(anyOf(statusOrClass(), statusOrOperatorBaseClass()))))); +} + +static auto isStatusOrOperatorCallWithName(llvm::StringRef operator_name) { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return cxxOperatorCallExpr( + hasOverloadedOperatorName(operator_name), + callee(cxxMethodDecl( + ofClass(anyOf(statusOrClass(), statusOrOperatorBaseClass()))))); +} + +static auto valueCall() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return anyOf(isStatusOrMemberCallWithName("value"), + isStatusOrMemberCallWithName("ValueOrDie")); +} + +static auto valueOperatorCall() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return expr(anyOf(isStatusOrOperatorCallWithName("*"), + isStatusOrOperatorCallWithName("->"))); +} + +static auto +buildDiagnoseMatchSwitch(const UncheckedStatusOrAccessModelOptions &Options) { + return CFGMatchSwitchBuilder>() + // StatusOr::value, StatusOr::ValueOrDie + .CaseOfCFGStmt( + valueCall(), + [](const CXXMemberCallExpr *E, + const ast_matchers::MatchFinder::MatchResult &, + const Environment &Env) { + if (!isSafeUnwrap(getImplicitObjectLocation(*E, Env), Env)) + return llvm::SmallVector({E->getExprLoc()}); + return llvm::SmallVector(); + }) + + // StatusOr::operator*, StatusOr::operator-> + .CaseOfCFGStmt( + valueOperatorCall(), + [](const CXXOperatorCallExpr *E, + const ast_matchers::MatchFinder::MatchResult &, + const Environment &Env) { + RecordStorageLocation *StatusOrLoc = + Env.get(*E->getArg(0)); + if (!isSafeUnwrap(StatusOrLoc, Env)) + return llvm::SmallVector({E->getOperatorLoc()}); + return llvm::SmallVector(); + }) + .Build(); +} + +UncheckedStatusOrAccessDiagnoser::UncheckedStatusOrAccessDiagnoser( + UncheckedStatusOrAccessModelOptions Options) + : DiagnoseMatchSwitch(buildDiagnoseMatchSwitch(Options)) {} + +llvm::SmallVector UncheckedStatusOrAccessDiagnoser::operator()( + const CFGElement &Elt, ASTContext &Ctx, + const TransferStateForDiagnostics + &State) { + return DiagnoseMatchSwitch(Elt, Ctx, State.Env); +} + +BoolValue &initializeStatus(RecordStorageLocation &StatusLoc, + Environment &Env) { + auto &OkVal = Env.makeAtomicBoolValue(); + Env.setValue(locForOk(StatusLoc), OkVal); + return OkVal; +} + +BoolValue &initializeStatusOr(RecordStorageLocation &StatusOrLoc, + Environment &Env) { + return initializeStatus(locForStatus(StatusOrLoc), Env); +} + +clang::ast_matchers::DeclarationMatcher statusOrClass() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return classTemplateSpecializationDecl( + hasName("absl::StatusOr"), + hasTemplateArgument(0, refersToType(type().bind("T")))); +} + +clang::ast_matchers::DeclarationMatcher statusClass() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return cxxRecordDecl(hasName("absl::Status")); +} + +clang::ast_matchers::DeclarationMatcher statusOrOperatorBaseClass() { + using namespace 
::clang::ast_matchers; // NOLINT: Too many names + return classTemplateSpecializationDecl( + hasName("absl::internal_statusor::OperatorBase")); +} + +clang::ast_matchers::TypeMatcher statusOrType() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return hasCanonicalType(qualType(hasDeclaration(statusOrClass()))); +} + +bool isStatusOrType(QualType Type) { + return isTypeNamed(Type, {"absl"}, "StatusOr"); +} + +bool isStatusType(QualType Type) { + return isTypeNamed(Type, {"absl"}, "Status"); +} + +llvm::StringMap getSyntheticFields(QualType Ty, QualType StatusType, + const CXXRecordDecl &RD) { + if (auto *TRD = getStatusOrBaseClass(Ty)) + return {{"status", StatusType}, {"value", getStatusOrValueType(TRD)}}; + if (isStatusType(Ty) || (RD.hasDefinition() && + RD.isDerivedFrom(StatusType->getAsCXXRecordDecl()))) + return {{"ok", RD.getASTContext().BoolTy}}; + return {}; +} + +RecordStorageLocation &locForStatus(RecordStorageLocation &StatusOrLoc) { + return cast(StatusOrLoc.getSyntheticField("status")); +} + +StorageLocation &locForOk(RecordStorageLocation &StatusLoc) { + return StatusLoc.getSyntheticField("ok"); +} + +BoolValue &valForOk(RecordStorageLocation &StatusLoc, Environment &Env) { + if (auto *Val = Env.get(locForOk(StatusLoc))) + return *Val; + return initializeStatus(StatusLoc, Env); +} + +static void transferStatusOrOkCall(const CXXMemberCallExpr *Expr, + const MatchFinder::MatchResult &, + LatticeTransferState &State) { + RecordStorageLocation *StatusOrLoc = + getImplicitObjectLocation(*Expr, State.Env); + if (StatusOrLoc == nullptr) + return; + + auto &OkVal = valForOk(locForStatus(*StatusOrLoc), State.Env); + State.Env.setValue(*Expr, OkVal); +} + +CFGMatchSwitch +buildTransferMatchSwitch(ASTContext &Ctx, + CFGMatchSwitchBuilder Builder) { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return std::move(Builder) + .CaseOfCFGStmt(isStatusOrMemberCallWithName("ok"), + transferStatusOrOkCall) + .Build(); +} + +QualType findStatusType(const ASTContext &Ctx) { + for (Type *Ty : Ctx.getTypes()) + if (isStatusType(QualType(Ty, 0))) + return QualType(Ty, 0); + + return QualType(); +} + +UncheckedStatusOrAccessModel::UncheckedStatusOrAccessModel(ASTContext &Ctx, + Environment &Env) + : DataflowAnalysis(Ctx), + TransferMatchSwitch(buildTransferMatchSwitch(Ctx, {})) { + QualType StatusType = findStatusType(Ctx); + Env.getDataflowAnalysisContext().setSyntheticFieldCallback( + [StatusType](QualType Ty) -> llvm::StringMap { + CXXRecordDecl *RD = Ty->getAsCXXRecordDecl(); + if (RD == nullptr) + return {}; + + if (auto Fields = getSyntheticFields(Ty, StatusType, *RD); + !Fields.empty()) + return Fields; + return {}; + }); +} + +void UncheckedStatusOrAccessModel::transfer(const CFGElement &Elt, Lattice &L, + Environment &Env) { + LatticeTransferState State(L, Env); + TransferMatchSwitch(Elt, getASTContext(), State); +} + +} // namespace clang::dataflow::statusor_model diff --git a/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp b/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp index 485308f5b2c3b..9b68de107e314 100644 --- a/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp +++ b/clang/lib/Analysis/LifetimeSafety/FactsGenerator.cpp @@ -9,9 +9,11 @@ #include "clang/Analysis/Analyses/LifetimeSafety/FactsGenerator.h" #include "clang/Analysis/Analyses/LifetimeSafety/LifetimeAnnotations.h" #include "clang/Analysis/Analyses/PostOrderCFGView.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/TimeProfiler.h" namespace 
clang::lifetimes::internal { +using llvm::isa_and_present; static bool isGslPointerType(QualType QT) { if (const auto *RD = QT->getAsCXXRecordDecl()) { @@ -108,7 +110,7 @@ void FactsGenerator::VisitCXXMemberCallExpr(const CXXMemberCallExpr *MCE) { // Specifically for conversion operators, // like `std::string_view p = std::string{};` if (isGslPointerType(MCE->getType()) && - isa(MCE->getCalleeDecl())) { + isa_and_present(MCE->getCalleeDecl())) { // The argument is the implicit object itself. handleFunctionCall(MCE, MCE->getMethodDecl(), {MCE->getImplicitObjectArgument()}, diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp index a56fdb1abd625..77750cf89d7a7 100644 --- a/clang/lib/Analysis/ThreadSafety.cpp +++ b/clang/lib/Analysis/ThreadSafety.cpp @@ -2032,9 +2032,7 @@ void BuildLockset::handleCall(const Expr *Exp, const NamedDecl *D, assert(inserted.second && "Are we visiting the same expression again?"); if (isa(Exp)) Self = Placeholder; - if (TagT->getOriginalDecl() - ->getMostRecentDecl() - ->hasAttr()) + if (TagT->getDecl()->getMostRecentDecl()->hasAttr()) Scp = CapabilityExpr(Placeholder, Exp->getType(), /*Neg=*/false); } diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp index f034514466d3f..19b557603d135 100644 --- a/clang/lib/Basic/LangOptions.cpp +++ b/clang/lib/Basic/LangOptions.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "clang/Basic/LangOptions.h" +#include "clang/Basic/LangStandard.h" #include "llvm/Support/Path.h" using namespace clang; @@ -131,8 +132,12 @@ void LangOptions::setLangDefaults(LangOptions &Opts, Language Lang, Opts.NamedLoops = Std.isC2y(); Opts.HLSL = Lang == Language::HLSL; - if (Opts.HLSL && Opts.IncludeDefaultHeader) - Includes.push_back("hlsl.h"); + if (Opts.HLSL) { + if (Opts.IncludeDefaultHeader) + Includes.push_back("hlsl.h"); + // Set maximum matrix dimension to 4 for HLSL + Opts.MaxMatrixDimension = 4; + } // Set OpenCL Version. 
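[Aside] The `isa_and_present` switch above is a null-safety fix: `MCE->getCalleeDecl()` may legitimately be null. A small self-contained sketch of the difference, using hypothetical types with LLVM-style RTTI:

```cpp
#include "llvm/Support/Casting.h"

struct Base {
  enum Kind { K_Base, K_Derived } kind;
  Base(Kind k = K_Base) : kind(k) {}
};
struct Derived : Base {
  Derived() : Base(K_Derived) {}
  static bool classof(const Base *b) { return b->kind == K_Derived; }
};

bool isDerived(const Base *b) {
  // llvm::isa<Derived>(b) asserts when b is null; the _and_present
  // variant simply returns false for a null pointer instead.
  return llvm::isa_and_present<Derived>(b);
}
```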
Opts.OpenCL = Std.isOpenCL(); @@ -243,3 +248,46 @@ LLVM_DUMP_METHOD void FPOptionsOverride::dump() { #include "clang/Basic/FPOptions.def" llvm::errs() << "\n"; } + +std::optional LangOptions::getCPlusPlusLangStd() const { + if (!CPlusPlus) + return std::nullopt; + + LangStandard::Kind Std; + if (CPlusPlus26) + Std = LangStandard::lang_cxx26; + else if (CPlusPlus23) + Std = LangStandard::lang_cxx23; + else if (CPlusPlus20) + Std = LangStandard::lang_cxx20; + else if (CPlusPlus17) + Std = LangStandard::lang_cxx17; + else if (CPlusPlus14) + Std = LangStandard::lang_cxx14; + else if (CPlusPlus11) + Std = LangStandard::lang_cxx11; + else + Std = LangStandard::lang_cxx98; + + return LangStandard::getLangStandardForKind(Std).getVersion(); +} + +std::optional LangOptions::getCLangStd() const { + LangStandard::Kind Std; + if (C2y) + Std = LangStandard::lang_c2y; + else if (C23) + Std = LangStandard::lang_c23; + else if (C17) + Std = LangStandard::lang_c17; + else if (C11) + Std = LangStandard::lang_c11; + else if (C99) + Std = LangStandard::lang_c99; + else if (!GNUMode && Digraphs) + Std = LangStandard::lang_c94; + else + return std::nullopt; + + return LangStandard::getLangStandardForKind(Std).getVersion(); +} diff --git a/clang/lib/Basic/LangStandards.cpp b/clang/lib/Basic/LangStandards.cpp index c49d095018b20..01c524b7220fb 100644 --- a/clang/lib/Basic/LangStandards.cpp +++ b/clang/lib/Basic/LangStandards.cpp @@ -46,16 +46,18 @@ StringRef clang::languageToString(Language L) { llvm_unreachable("unhandled language kind"); } -#define LANGSTANDARD(id, name, lang, desc, features) \ - static const LangStandard Lang_##id = {name, desc, features, Language::lang}; +#define LANGSTANDARD(id, name, lang, desc, features, version) \ + static const LangStandard Lang_##id = {name, desc, features, Language::lang, \ + version}; #include "clang/Basic/LangStandards.def" const LangStandard &LangStandard::getLangStandardForKind(Kind K) { switch (K) { case lang_unspecified: llvm::report_fatal_error("getLangStandardForKind() on unspecified kind"); -#define LANGSTANDARD(id, name, lang, desc, features) \ - case lang_##id: return Lang_##id; +#define LANGSTANDARD(id, name, lang, desc, features, version) \ + case lang_##id: \ + return Lang_##id; #include "clang/Basic/LangStandards.def" } llvm_unreachable("Invalid language kind!"); @@ -63,7 +65,8 @@ const LangStandard &LangStandard::getLangStandardForKind(Kind K) { LangStandard::Kind LangStandard::getLangKind(StringRef Name) { return llvm::StringSwitch(Name) -#define LANGSTANDARD(id, name, lang, desc, features) .Case(name, lang_##id) +#define LANGSTANDARD(id, name, lang, desc, features, version) \ + .Case(name, lang_##id) #define LANGSTANDARD_ALIAS(id, alias) .Case(alias, lang_##id) #include "clang/Basic/LangStandards.def" .Default(lang_unspecified); diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index b7e8bad4f404b..f39c698b5d734 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -222,6 +222,8 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, return std::make_unique>(Triple, Opts); case llvm::Triple::FreeBSD: return std::make_unique>(Triple, Opts); + case llvm::Triple::Fuchsia: + return std::make_unique>(Triple, Opts); case llvm::Triple::NetBSD: return std::make_unique>(Triple, Opts); case llvm::Triple::OpenBSD: @@ -254,6 +256,8 @@ std::unique_ptr AllocateTarget(const llvm::Triple &Triple, return std::make_unique(Triple, Opts); switch (os) { + case llvm::Triple::Fuchsia: + return std::make_unique>(Triple, 
Opts); case llvm::Triple::Linux: return std::make_unique>(Triple, Opts); case llvm::Triple::NetBSD: diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 9e03a0846ffba..18641a96063cd 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -1568,6 +1568,7 @@ bool AArch64TargetInfo::validateAsmConstraint( if (const unsigned Len = matchAsmCCConstraint(Name)) { Name += Len - 1; Info.setAllowsRegister(); + Info.setOutputOperandBounds(0, 2); return true; } } diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 3de17d2c829f1..d00a3a453ed52 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -260,6 +260,7 @@ ARMTargetInfo::ARMTargetInfo(const llvm::Triple &Triple, : TargetInfo(Triple), FPMath(FP_Default), IsAAPCS(true), LDREX(0), HW_FP(0) { bool IsFreeBSD = Triple.isOSFreeBSD(); + bool IsFuchsia = Triple.isOSFuchsia(); bool IsOpenBSD = Triple.isOSOpenBSD(); bool IsNetBSD = Triple.isOSNetBSD(); bool IsHaiku = Triple.isOSHaiku(); @@ -332,7 +333,7 @@ ARMTargetInfo::ARMTargetInfo(const llvm::Triple &Triple, default: if (IsNetBSD) setABI("apcs-gnu"); - else if (IsFreeBSD || IsOpenBSD || IsHaiku || IsOHOS) + else if (IsFreeBSD || IsFuchsia || IsOpenBSD || IsHaiku || IsOHOS) setABI("aapcs-linux"); else setABI("aapcs"); diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp index bbe7b01ca036d..2673669bc9035 100644 --- a/clang/lib/Basic/Targets/AVR.cpp +++ b/clang/lib/Basic/Targets/AVR.cpp @@ -420,23 +420,23 @@ static MCUInfo AVRMcus[] = { static bool ArchHasELPM(StringRef Arch) { return llvm::StringSwitch(Arch) - .Cases("31", "51", "6", true) - .Cases("102", "104", "105", "106", "107", true) - .Default(false); + .Cases({"31", "51", "6"}, true) + .Cases({"102", "104", "105", "106", "107"}, true) + .Default(false); } static bool ArchHasELPMX(StringRef Arch) { return llvm::StringSwitch(Arch) - .Cases("51", "6", true) - .Cases("102", "104", "105", "106", "107", true) - .Default(false); + .Cases({"51", "6"}, true) + .Cases({"102", "104", "105", "106", "107"}, true) + .Default(false); } static bool ArchHasMOVW(StringRef Arch) { return llvm::StringSwitch(Arch) - .Cases("25", "35", "4", "5", "51", "6", true) - .Cases("102", "103", "104", "105", "106", "107", true) - .Default(false); + .Cases({"25", "35", "4", "5", "51", "6"}, true) + .Cases({"102", "103", "104", "105", "106", "107"}, true) + .Default(false); } static bool ArchHasLPMX(StringRef Arch) { @@ -445,16 +445,16 @@ static bool ArchHasLPMX(StringRef Arch) { static bool ArchHasMUL(StringRef Arch) { return llvm::StringSwitch(Arch) - .Cases("4", "5", "51", "6", true) - .Cases("102", "103", "104", "105", "106", "107", true) - .Default(false); + .Cases({"4", "5", "51", "6"}, true) + .Cases({"102", "103", "104", "105", "106", "107"}, true) + .Default(false); } static bool ArchHasJMPCALL(StringRef Arch) { return llvm::StringSwitch(Arch) - .Cases("3", "31", "35", "5", "51", "6", true) - .Cases("102", "103", "104", "105", "106", "107", true) - .Default(false); + .Cases({"3", "31", "35", "5", "51", "6"}, true) + .Cases({"102", "103", "104", "105", "106", "107"}, true) + .Default(false); } static bool ArchHas3BytePC(StringRef Arch) { diff --git a/clang/lib/Basic/Targets/DirectX.h b/clang/lib/Basic/Targets/DirectX.h index bd13c9ee0fd05..a21a593365773 100644 --- a/clang/lib/Basic/Targets/DirectX.h +++ b/clang/lib/Basic/Targets/DirectX.h @@ -64,8 +64,11 @@ class LLVM_LIBRARY_VISIBILITY DirectXTargetInfo : 
public TargetInfo {
     NoAsmVariants = true;
     PlatformMinVersion = Triple.getOSVersion();
     PlatformName = llvm::Triple::getOSTypeName(Triple.getOS());
+    // TODO: We need to align vectors on the element size generally, but for now
+    // we hard code this for 3-element 32- and 64-bit vectors as a workaround.
+    // See https://github.com/llvm/llvm-project/issues/123968
     resetDataLayout("e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:"
-                    "32-f64:64-n8:16:32:64");
+                    "32-f64:64-n8:16:32:64-v48:16:16-v96:32:32-v192:64:64");
     TheCXXABI.set(TargetCXXABI::GenericItanium);
   }
   bool useFP16ConversionIntrinsics() const override { return false; }
diff --git a/clang/lib/Basic/Targets/Mips.cpp b/clang/lib/Basic/Targets/Mips.cpp
index de6ccff64f4eb..a999d1410d254 100644
--- a/clang/lib/Basic/Targets/Mips.cpp
+++ b/clang/lib/Basic/Targets/Mips.cpp
@@ -68,11 +68,11 @@ void MipsTargetInfo::fillValidCPUList(
 unsigned MipsTargetInfo::getISARev() const {
   return llvm::StringSwitch(getCPU())
-      .Cases("mips32", "mips64", 1)
-      .Cases("mips32r2", "mips64r2", "octeon", "octeon+", 2)
-      .Cases("mips32r3", "mips64r3", 3)
-      .Cases("mips32r5", "mips64r5", "p5600", 5)
-      .Cases("mips32r6", "mips64r6", "i6400", "i6500", 6)
+      .Cases({"mips32", "mips64"}, 1)
+      .Cases({"mips32r2", "mips64r2", "octeon", "octeon+"}, 2)
+      .Cases({"mips32r3", "mips64r3"}, 3)
+      .Cases({"mips32r5", "mips64r5", "p5600"}, 5)
+      .Cases({"mips32r6", "mips64r6", "i6400", "i6500"}, 6)
       .Default(0);
 }
diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp
index 04da4e637af51..685925b0773dc 100644
--- a/clang/lib/Basic/Targets/RISCV.cpp
+++ b/clang/lib/Basic/Targets/RISCV.cpp
@@ -192,8 +192,11 @@ void RISCVTargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__riscv_muldiv");
   }
-  if (ISAInfo->hasExtension("a")) {
+  // The "a" extension is composed of "zalrsc" and "zaamo"
+  if (ISAInfo->hasExtension("a"))
     Builder.defineMacro("__riscv_atomic");
+
+  if (ISAInfo->hasExtension("zalrsc")) {
     Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
     Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
     Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");
diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
index d8b0e64c90dd6..85fa4cc07dccf 100644
--- a/clang/lib/Basic/Targets/RISCV.h
+++ b/clang/lib/Basic/Targets/RISCV.h
@@ -195,7 +195,8 @@ class LLVM_LIBRARY_VISIBILITY RISCV32TargetInfo : public RISCVTargetInfo {
   void setMaxAtomicWidth() override {
     MaxAtomicPromoteWidth = 128;
-    if (ISAInfo->hasExtension("a"))
+    // "a" implies "zalrsc" which is sufficient to inline atomics
+    if (ISAInfo->hasExtension("zalrsc"))
       MaxAtomicInlineWidth = 32;
   }
 };
@@ -225,7 +226,8 @@ class LLVM_LIBRARY_VISIBILITY RISCV64TargetInfo : public RISCVTargetInfo {
   void setMaxAtomicWidth() override {
     MaxAtomicPromoteWidth = 128;
-    if (ISAInfo->hasExtension("a"))
+    // "a" implies "zalrsc" which is sufficient to inline atomics
+    if (ISAInfo->hasExtension("zalrsc"))
       MaxAtomicInlineWidth = 64;
   }
 };
diff --git a/clang/lib/Basic/Targets/SystemZ.cpp b/clang/lib/Basic/Targets/SystemZ.cpp
index 13b86234eed79..30f846cb900f8 100644
--- a/clang/lib/Basic/Targets/SystemZ.cpp
+++ b/clang/lib/Basic/Targets/SystemZ.cpp
@@ -99,6 +99,16 @@ bool SystemZTargetInfo::validateAsmConstraint(
   case 'T': // Likewise, plus an index
     Info.setAllowsMemory();
     return true;
+  case '@':
+    // CC condition changes.
+    if (StringRef(Name) == "@cc") {
+      Name += 2;
+      Info.setAllowsRegister();
+      // SystemZ has a 2-bit CC, hence the interval [0, 4).
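[Aside] A sketch of the source-level feature the SystemZ hunks enable, modeled on GCC's flag-output constraint syntax; the mnemonic and operand order here are illustrative assumptions, not taken from this patch:

```cpp
// Returns the raw SystemZ condition code (0..3) produced by a compare,
// which is why the output operand is bounded to the interval [0, 4).
int compareCC(int a, int b) {
  int cc;
  asm("cr %1,%2" : "=@cc"(cc) : "r"(a), "r"(b));
  return cc;
}
```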
+ Info.setOutputOperandBounds(0, 4); + return true; + } + return false; } } @@ -161,6 +171,9 @@ unsigned SystemZTargetInfo::getMinGlobalAlign(uint64_t Size, void SystemZTargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { + // Inline assembly supports SystemZ flag outputs. + Builder.defineMacro("__GCC_ASM_FLAG_OUTPUTS__"); + Builder.defineMacro("__s390__"); Builder.defineMacro("__s390x__"); Builder.defineMacro("__zarch__"); diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h index dc2185e1b45ca..4e15d5af1cde6 100644 --- a/clang/lib/Basic/Targets/SystemZ.h +++ b/clang/lib/Basic/Targets/SystemZ.h @@ -136,6 +136,12 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { std::string convertConstraint(const char *&Constraint) const override { switch (Constraint[0]) { + case '@': // Flag output operand. + if (llvm::StringRef(Constraint) == "@cc") { + Constraint += 2; + return std::string("{@cc}"); + } + break; case 'p': // Keep 'p' constraint. return std::string("p"); case 'Z': diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 6eb4db51d4e6e..e71f10c4c16fc 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -625,6 +625,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, case CK_ArrowlakeS: case CK_Lunarlake: case CK_Pantherlake: + case CK_Wildcatlake: + case CK_Novalake: case CK_Sierraforest: case CK_Grandridge: case CK_Graniterapids: @@ -1516,6 +1518,7 @@ bool X86TargetInfo::validateAsmConstraint( if (auto Len = matchAsmCCConstraint(Name)) { Name += Len - 1; Info.setAllowsRegister(); + Info.setOutputOperandBounds(0, 2); return true; } return false; @@ -1612,6 +1615,8 @@ std::optional X86TargetInfo::getCPUCacheLineSize() const { case CK_ArrowlakeS: case CK_Lunarlake: case CK_Pantherlake: + case CK_Wildcatlake: + case CK_Novalake: case CK_Sierraforest: case CK_Grandridge: case CK_Graniterapids: diff --git a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp index 0f4d6d20afc52..7db6e283ec0a5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp @@ -255,7 +255,7 @@ static void emitAtomicCmpXchg(CIRGenFunction &cgf, AtomicExpr *e, bool isWeak, mlir::Value expected = builder.createLoad(loc, val1); mlir::Value desired = builder.createLoad(loc, val2); - auto cmpxchg = cir::AtomicCmpXchg::create( + auto cmpxchg = cir::AtomicCmpXchgOp::create( builder, loc, expected.getType(), builder.getBoolTy(), ptr.getPointer(), expected, desired, cir::MemOrderAttr::get(&cgf.getMLIRContext(), successOrder), @@ -346,6 +346,8 @@ static void emitAtomicOp(CIRGenFunction &cgf, AtomicExpr *expr, Address dest, CIRGenBuilderTy &builder = cgf.getBuilder(); mlir::Location loc = cgf.getLoc(expr->getSourceRange()); auto orderAttr = cir::MemOrderAttr::get(builder.getContext(), order); + cir::AtomicFetchKindAttr fetchAttr; + bool fetchFirst = true; switch (expr->getOp()) { case AtomicExpr::AO__c11_atomic_init: @@ -404,9 +406,106 @@ static void emitAtomicOp(CIRGenFunction &cgf, AtomicExpr *expr, Address dest, case AtomicExpr::AO__c11_atomic_exchange: case AtomicExpr::AO__atomic_exchange_n: case AtomicExpr::AO__atomic_exchange: - opName = cir::AtomicXchg::getOperationName(); + opName = cir::AtomicXchgOp::getOperationName(); break; + case AtomicExpr::AO__atomic_add_fetch: + fetchFirst = false; + [[fallthrough]]; + case AtomicExpr::AO__c11_atomic_fetch_add: + case AtomicExpr::AO__atomic_fetch_add: + 
opName = cir::AtomicFetchOp::getOperationName(); + fetchAttr = cir::AtomicFetchKindAttr::get(builder.getContext(), + cir::AtomicFetchKind::Add); + break; + + case AtomicExpr::AO__atomic_sub_fetch: + fetchFirst = false; + [[fallthrough]]; + case AtomicExpr::AO__c11_atomic_fetch_sub: + case AtomicExpr::AO__atomic_fetch_sub: + opName = cir::AtomicFetchOp::getOperationName(); + fetchAttr = cir::AtomicFetchKindAttr::get(builder.getContext(), + cir::AtomicFetchKind::Sub); + break; + + case AtomicExpr::AO__atomic_min_fetch: + fetchFirst = false; + [[fallthrough]]; + case AtomicExpr::AO__c11_atomic_fetch_min: + case AtomicExpr::AO__atomic_fetch_min: + opName = cir::AtomicFetchOp::getOperationName(); + fetchAttr = cir::AtomicFetchKindAttr::get(builder.getContext(), + cir::AtomicFetchKind::Min); + break; + + case AtomicExpr::AO__atomic_max_fetch: + fetchFirst = false; + [[fallthrough]]; + case AtomicExpr::AO__c11_atomic_fetch_max: + case AtomicExpr::AO__atomic_fetch_max: + opName = cir::AtomicFetchOp::getOperationName(); + fetchAttr = cir::AtomicFetchKindAttr::get(builder.getContext(), + cir::AtomicFetchKind::Max); + break; + + case AtomicExpr::AO__atomic_and_fetch: + fetchFirst = false; + [[fallthrough]]; + case AtomicExpr::AO__c11_atomic_fetch_and: + case AtomicExpr::AO__atomic_fetch_and: + opName = cir::AtomicFetchOp::getOperationName(); + fetchAttr = cir::AtomicFetchKindAttr::get(builder.getContext(), + cir::AtomicFetchKind::And); + break; + + case AtomicExpr::AO__atomic_or_fetch: + fetchFirst = false; + [[fallthrough]]; + case AtomicExpr::AO__c11_atomic_fetch_or: + case AtomicExpr::AO__atomic_fetch_or: + opName = cir::AtomicFetchOp::getOperationName(); + fetchAttr = cir::AtomicFetchKindAttr::get(builder.getContext(), + cir::AtomicFetchKind::Or); + break; + + case AtomicExpr::AO__atomic_xor_fetch: + fetchFirst = false; + [[fallthrough]]; + case AtomicExpr::AO__c11_atomic_fetch_xor: + case AtomicExpr::AO__atomic_fetch_xor: + opName = cir::AtomicFetchOp::getOperationName(); + fetchAttr = cir::AtomicFetchKindAttr::get(builder.getContext(), + cir::AtomicFetchKind::Xor); + break; + + case AtomicExpr::AO__atomic_nand_fetch: + fetchFirst = false; + [[fallthrough]]; + case AtomicExpr::AO__c11_atomic_fetch_nand: + case AtomicExpr::AO__atomic_fetch_nand: + opName = cir::AtomicFetchOp::getOperationName(); + fetchAttr = cir::AtomicFetchKindAttr::get(builder.getContext(), + cir::AtomicFetchKind::Nand); + break; + + case AtomicExpr::AO__atomic_test_and_set: { + auto op = cir::AtomicTestAndSetOp::create( + builder, loc, ptr.getPointer(), order, + builder.getI64IntegerAttr(ptr.getAlignment().getQuantity()), + expr->isVolatile()); + builder.createStore(loc, op, dest); + return; + } + + case AtomicExpr::AO__atomic_clear: { + cir::AtomicClearOp::create( + builder, loc, ptr.getPointer(), order, + builder.getI64IntegerAttr(ptr.getAlignment().getQuantity()), + expr->isVolatile()); + return; + } + case AtomicExpr::AO__opencl_atomic_init: case AtomicExpr::AO__hip_atomic_compare_exchange_strong: @@ -433,79 +532,51 @@ static void emitAtomicOp(CIRGenFunction &cgf, AtomicExpr *expr, Address dest, case AtomicExpr::AO__scoped_atomic_exchange_n: case AtomicExpr::AO__scoped_atomic_exchange: - case AtomicExpr::AO__atomic_add_fetch: case AtomicExpr::AO__scoped_atomic_add_fetch: - case AtomicExpr::AO__c11_atomic_fetch_add: case AtomicExpr::AO__hip_atomic_fetch_add: case AtomicExpr::AO__opencl_atomic_fetch_add: - case AtomicExpr::AO__atomic_fetch_add: case AtomicExpr::AO__scoped_atomic_fetch_add: - case 
AtomicExpr::AO__atomic_sub_fetch: case AtomicExpr::AO__scoped_atomic_sub_fetch: - case AtomicExpr::AO__c11_atomic_fetch_sub: case AtomicExpr::AO__hip_atomic_fetch_sub: case AtomicExpr::AO__opencl_atomic_fetch_sub: - case AtomicExpr::AO__atomic_fetch_sub: case AtomicExpr::AO__scoped_atomic_fetch_sub: - case AtomicExpr::AO__atomic_min_fetch: case AtomicExpr::AO__scoped_atomic_min_fetch: - case AtomicExpr::AO__c11_atomic_fetch_min: case AtomicExpr::AO__hip_atomic_fetch_min: case AtomicExpr::AO__opencl_atomic_fetch_min: - case AtomicExpr::AO__atomic_fetch_min: case AtomicExpr::AO__scoped_atomic_fetch_min: - case AtomicExpr::AO__atomic_max_fetch: case AtomicExpr::AO__scoped_atomic_max_fetch: - case AtomicExpr::AO__c11_atomic_fetch_max: case AtomicExpr::AO__hip_atomic_fetch_max: case AtomicExpr::AO__opencl_atomic_fetch_max: - case AtomicExpr::AO__atomic_fetch_max: case AtomicExpr::AO__scoped_atomic_fetch_max: - case AtomicExpr::AO__atomic_and_fetch: case AtomicExpr::AO__scoped_atomic_and_fetch: - case AtomicExpr::AO__c11_atomic_fetch_and: case AtomicExpr::AO__hip_atomic_fetch_and: case AtomicExpr::AO__opencl_atomic_fetch_and: - case AtomicExpr::AO__atomic_fetch_and: case AtomicExpr::AO__scoped_atomic_fetch_and: - case AtomicExpr::AO__atomic_or_fetch: case AtomicExpr::AO__scoped_atomic_or_fetch: - case AtomicExpr::AO__c11_atomic_fetch_or: case AtomicExpr::AO__hip_atomic_fetch_or: case AtomicExpr::AO__opencl_atomic_fetch_or: - case AtomicExpr::AO__atomic_fetch_or: case AtomicExpr::AO__scoped_atomic_fetch_or: - case AtomicExpr::AO__atomic_xor_fetch: case AtomicExpr::AO__scoped_atomic_xor_fetch: - case AtomicExpr::AO__c11_atomic_fetch_xor: case AtomicExpr::AO__hip_atomic_fetch_xor: case AtomicExpr::AO__opencl_atomic_fetch_xor: - case AtomicExpr::AO__atomic_fetch_xor: case AtomicExpr::AO__scoped_atomic_fetch_xor: - case AtomicExpr::AO__atomic_nand_fetch: case AtomicExpr::AO__scoped_atomic_nand_fetch: - case AtomicExpr::AO__c11_atomic_fetch_nand: - case AtomicExpr::AO__atomic_fetch_nand: case AtomicExpr::AO__scoped_atomic_fetch_nand: - - case AtomicExpr::AO__atomic_test_and_set: - - case AtomicExpr::AO__atomic_clear: cgf.cgm.errorNYI(expr->getSourceRange(), "emitAtomicOp: expr op NYI"); return; } @@ -518,9 +589,13 @@ static void emitAtomicOp(CIRGenFunction &cgf, AtomicExpr *expr, Address dest, mlir::Operation *rmwOp = builder.create(loc, builder.getStringAttr(opName), atomicOperands, atomicResTys); + if (fetchAttr) + rmwOp->setAttr("binop", fetchAttr); rmwOp->setAttr("mem_order", orderAttr); if (expr->isVolatile()) rmwOp->setAttr("is_volatile", builder.getUnitAttr()); + if (fetchFirst && opName == cir::AtomicFetchOp::getOperationName()) + rmwOp->setAttr("fetch_first", builder.getUnitAttr()); mlir::Value result = rmwOp->getResult(0); builder.createStore(loc, result, dest); @@ -581,6 +656,8 @@ RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) { case AtomicExpr::AO__atomic_load_n: case AtomicExpr::AO__c11_atomic_load: + case AtomicExpr::AO__atomic_test_and_set: + case AtomicExpr::AO__atomic_clear: break; case AtomicExpr::AO__atomic_load: @@ -614,8 +691,41 @@ RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) { isWeakExpr = e->getWeak(); break; + case AtomicExpr::AO__c11_atomic_fetch_add: + case AtomicExpr::AO__c11_atomic_fetch_sub: + if (memTy->isPointerType()) { + cgm.errorNYI(e->getSourceRange(), + "atomic fetch-and-add and fetch-and-sub for pointers"); + return RValue::get(nullptr); + } + [[fallthrough]]; + case AtomicExpr::AO__atomic_fetch_add: + case AtomicExpr::AO__atomic_fetch_max: + case 
AtomicExpr::AO__atomic_fetch_min: + case AtomicExpr::AO__atomic_fetch_sub: + case AtomicExpr::AO__atomic_add_fetch: + case AtomicExpr::AO__atomic_max_fetch: + case AtomicExpr::AO__atomic_min_fetch: + case AtomicExpr::AO__atomic_sub_fetch: + case AtomicExpr::AO__c11_atomic_fetch_max: + case AtomicExpr::AO__c11_atomic_fetch_min: + shouldCastToIntPtrTy = !memTy->isFloatingType(); + [[fallthrough]]; + + case AtomicExpr::AO__atomic_fetch_and: + case AtomicExpr::AO__atomic_fetch_nand: + case AtomicExpr::AO__atomic_fetch_or: + case AtomicExpr::AO__atomic_fetch_xor: + case AtomicExpr::AO__atomic_and_fetch: + case AtomicExpr::AO__atomic_nand_fetch: + case AtomicExpr::AO__atomic_or_fetch: + case AtomicExpr::AO__atomic_xor_fetch: case AtomicExpr::AO__atomic_exchange_n: case AtomicExpr::AO__atomic_store_n: + case AtomicExpr::AO__c11_atomic_fetch_and: + case AtomicExpr::AO__c11_atomic_fetch_nand: + case AtomicExpr::AO__c11_atomic_fetch_or: + case AtomicExpr::AO__c11_atomic_fetch_xor: case AtomicExpr::AO__c11_atomic_exchange: case AtomicExpr::AO__c11_atomic_store: val1 = emitValToTemp(*this, e->getVal1()); @@ -640,6 +750,9 @@ RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) { dest = atomics.castToAtomicIntPointer(dest); } else if (e->isCmpXChg()) { dest = createMemTemp(resultTy, getLoc(e->getSourceRange()), "cmpxchg.bool"); + } else if (e->getOp() == AtomicExpr::AO__atomic_test_and_set) { + dest = createMemTemp(resultTy, getLoc(e->getSourceRange()), + "test_and_set.bool"); } else if (!resultTy->isVoidType()) { dest = atomics.createTempAlloca(); if (shouldCastToIntPtrTy) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index a6f10e6bb9e9c..50d585dca3b8c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -380,6 +380,16 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { /*relative_layout=*/false); } + mlir::Value createDynCastToVoid(mlir::Location loc, mlir::Value src, + bool vtableUseRelativeLayout) { + // TODO(cir): consider address space here. 
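[Aside] For reference, the two builtin families the CIRGenAtomic switch above distinguishes both lower to the same fetch operation and differ only in whether the pre-op ("fetch_first") or post-op value is the result:

```cpp
int fetchThenAdd(int *p) {
  // Returns the value *before* the add: the fetch_first form.
  return __atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST);
}

int addThenFetch(int *p) {
  // Returns the value *after* the add: fetchFirst is cleared above.
  return __atomic_add_fetch(p, 1, __ATOMIC_SEQ_CST);
}
```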
+    assert(!cir::MissingFeatures::addressSpace());
+    cir::PointerType destTy = getVoidPtrTy();
+    return cir::DynamicCastOp::create(
+        *this, loc, destTy, cir::DynamicCastKind::Ptr, src,
+        cir::DynamicCastInfoAttr{}, vtableUseRelativeLayout);
+  }
+
   Address createBaseClassAddr(mlir::Location loc, Address addr,
                               mlir::Type destType, unsigned offset,
                               bool assumeNotNull) {
@@ -519,6 +529,14 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
     return createGlobal(module, loc, uniqueName, type, isConstant, linkage);
   }
 
+  cir::StackSaveOp createStackSave(mlir::Location loc, mlir::Type ty) {
+    return cir::StackSaveOp::create(*this, loc, ty);
+  }
+
+  cir::StackRestoreOp createStackRestore(mlir::Location loc, mlir::Value v) {
+    return cir::StackRestoreOp::create(*this, loc, v);
+  }
+
   mlir::Value createSetBitfield(mlir::Location loc, mlir::Type resultType,
                                 Address dstAddr, mlir::Type storageType,
                                 mlir::Value src, const CIRGenBitFieldInfo &info,
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 4cfa91e09efb4..ea31871806bd7 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -463,7 +463,9 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
     return emitLibraryCall(*this, fd, e,
                            cgm.getBuiltinLibFunction(fd, builtinID));
 
-  cgm.errorNYI(e->getSourceRange(), "unimplemented builtin call");
+  cgm.errorNYI(e->getSourceRange(),
+               std::string("unimplemented builtin call: ") +
+                   getContext().BuiltinInfo.getName(builtinID));
   return getUndefRValue(e->getType());
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp
index 274d11b8c7629..171ce1c950907 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp
@@ -171,7 +171,8 @@ cir::FuncOp CIRGenModule::codegenCXXStructor(GlobalDecl gd) {
   curCGF = nullptr;
 
   setNonAliasAttributes(gd, fn);
-  assert(!cir::MissingFeatures::opFuncAttributesForDefinition());
+  setCIRFunctionAttributesForDefinition(mlir::cast(gd.getDecl()),
+                                        fn);
   return fn;
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp
index df42af828b0a3..d81492ef01540 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp
@@ -37,6 +37,10 @@ CIRGenCXXABI::AddedStructorArgCounts CIRGenCXXABI::addImplicitConstructorArgs(
                                  addedArgs.suffix.size());
 }
 
+CatchTypeInfo CIRGenCXXABI::getCatchAllTypeInfo() {
+  return CatchTypeInfo{{}, 0};
+}
+
 void CIRGenCXXABI::buildThisParam(CIRGenFunction &cgf,
                                   FunctionArgList &params) {
   const auto *md = cast(cgf.curGD.getDecl());
diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
index 6d3741c417351..da7cc72136ef1 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
@@ -15,6 +15,7 @@
 #define LLVM_CLANG_LIB_CIR_CIRGENCXXABI_H
 
 #include "CIRGenCall.h"
+#include "CIRGenCleanup.h"
 #include "CIRGenFunction.h"
 #include "CIRGenModule.h"
 
@@ -155,6 +156,8 @@ class CIRGenCXXABI {
   /// Loads the incoming C++ this pointer as it was passed by the caller.
   mlir::Value loadIncomingCXXThis(CIRGenFunction &cgf);
 
+  virtual CatchTypeInfo getCatchAllTypeInfo();
+
   /// Get the implicit (second) parameter that comes after the "this" pointer,
   /// or nullptr if there isn't one.
virtual mlir::Value getCXXDestructorImplicitParam(CIRGenFunction &cgf, diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index dd357ce69f1b3..89f492603b11c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -478,8 +478,7 @@ void CIRGenFunction::getVTablePointers(BaseSubobject base, for (const auto &nextBase : rd->bases()) { const auto *baseDecl = - cast( - nextBase.getType()->castAs()->getOriginalDecl()) + cast(nextBase.getType()->castAs()->getDecl()) ->getDefinitionOrSelf(); // Ignore classes without a vtable. @@ -1025,7 +1024,7 @@ void CIRGenFunction::enterDtorCleanups(const CXXDestructorDecl *dd, // Anonymous union members do not have their destructors called. const RecordType *rt = type->getAsUnionType(); - if (rt && rt->getOriginalDecl()->isAnonymousStructOrUnion()) + if (rt && rt->getDecl()->isAnonymousStructOrUnion()) continue; CleanupKind cleanupKind = getCleanupKind(dtorKind); diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp index 870069715df22..aabe4bbdf18c8 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp @@ -108,6 +108,13 @@ void EHScopeStack::popCleanup() { assert(!cir::MissingFeatures::ehCleanupBranchFixups()); } +EHCatchScope *EHScopeStack::pushCatch(unsigned numHandlers) { + char *buffer = allocate(EHCatchScope::getSizeForNumHandlers(numHandlers)); + assert(!cir::MissingFeatures::innermostEHScope()); + EHCatchScope *scope = new (buffer) EHCatchScope(numHandlers); + return scope; +} + static void emitCleanup(CIRGenFunction &cgf, EHScopeStack::Cleanup *cleanup) { // Ask the cleanup to emit itself. assert(cgf.haveInsertPoint() && "expected insertion point"); diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h index 30f5607d655da..5852ea3f43dd5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.h +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h @@ -20,6 +20,13 @@ namespace clang::CIRGen { +/// The MS C++ ABI needs a pointer to RTTI data plus some flags to describe the +/// type of a catch handler, so we use this wrapper. +struct CatchTypeInfo { + mlir::TypedAttr rtti; + unsigned flags; +}; + /// A protected scope for zero-cost EH handling. class EHScope { class CommonBitFields { @@ -29,6 +36,12 @@ class EHScope { enum { NumCommonBits = 3 }; protected: + class CatchBitFields { + friend class EHCatchScope; + unsigned : NumCommonBits; + unsigned numHandlers : 32 - NumCommonBits; + }; + class CleanupBitFields { friend class EHCleanupScope; unsigned : NumCommonBits; @@ -58,6 +71,7 @@ class EHScope { union { CommonBitFields commonBits; + CatchBitFields catchBits; CleanupBitFields cleanupBits; }; @@ -67,6 +81,71 @@ class EHScope { EHScope(Kind kind) { commonBits.kind = kind; } Kind getKind() const { return static_cast(commonBits.kind); } + + bool mayThrow() const { + // Traditional LLVM codegen also checks for `!block->use_empty()`, but + // in CIRGen the block content is not important, just used as a way to + // signal `hasEHBranches`. + assert(!cir::MissingFeatures::ehstackBranches()); + return false; + } +}; + +/// A scope which attempts to handle some, possibly all, types of +/// exceptions. +/// +/// Objective C \@finally blocks are represented using a cleanup scope +/// after the catch scope. 
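[Aside] The class that follows stores its handlers in memory immediately past the object, which is what `EHScopeStack::pushCatch` above allocates for. A stripped-down sketch of that trailing-storage idiom (names hypothetical):

```cpp
#include <cstddef>
#include <new>

struct Scope {
  struct Handler { int tag; };
  unsigned numHandlers;

  explicit Scope(unsigned n) : numHandlers(n) {}
  // The handlers live right after the Scope object itself.
  Handler *handlers() { return reinterpret_cast<Handler *>(this + 1); }
  static std::size_t sizeFor(unsigned n) {
    return sizeof(Scope) + n * sizeof(Handler);
  }
};

Scope *makeScope(char *buffer, unsigned n) {
  // One allocation of Scope::sizeFor(n) bytes holds the header plus the
  // n trailing handlers, exactly like pushCatch's buffer above.
  return new (buffer) Scope(n);
}
```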
+ +class EHCatchScope : public EHScope { + // In effect, we have a flexible array member + // Handler Handlers[0]; + // But that's only standard in C99, not C++, so we have to do + // annoying pointer arithmetic instead. + +public: + struct Handler { + /// A type info value, or null MLIR attribute for a catch-all + CatchTypeInfo type; + + /// The catch handler for this type. + mlir::Region *region; + }; + +private: + friend class EHScopeStack; + + Handler *getHandlers() { return reinterpret_cast(this + 1); } + +public: + static size_t getSizeForNumHandlers(unsigned n) { + return sizeof(EHCatchScope) + n * sizeof(Handler); + } + + EHCatchScope(unsigned numHandlers) : EHScope(Catch) { + catchBits.numHandlers = numHandlers; + assert(catchBits.numHandlers == numHandlers && "NumHandlers overflow?"); + } + + unsigned getNumHandlers() const { return catchBits.numHandlers; } + + void setHandler(unsigned i, CatchTypeInfo type, mlir::Region *region) { + assert(i < getNumHandlers()); + getHandlers()[i].type = type; + getHandlers()[i].region = region; + } + + // Clear all handler blocks. + // FIXME: it's better to always call clearHandlerBlocks in DTOR and have a + // 'takeHandler' or some such function which removes ownership from the + // EHCatchScope object if the handlers should live longer than EHCatchScope. + void clearHandlerBlocks() { + // The blocks are owned by TryOp, nothing to delete. + } + + static bool classof(const EHScope *scope) { + return scope->getKind() == Catch; + } }; /// A cleanup scope which generates the cleanup blocks lazily. @@ -147,5 +226,13 @@ EHScopeStack::find(stable_iterator savePoint) const { return iterator(endOfBuffer - savePoint.size); } +inline void EHScopeStack::popCatch() { + assert(!empty() && "popping exception stack when not empty"); + + EHCatchScope &scope = llvm::cast(*begin()); + assert(!cir::MissingFeatures::innermostEHScope()); + deallocate(EHCatchScope::getSizeForNumHandlers(scope.getNumHandlers())); +} + } // namespace clang::CIRGen #endif // CLANG_LIB_CIR_CODEGEN_CIRGENCLEANUP_H diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp index 039d29033ea87..5667273c00daf 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp @@ -44,38 +44,70 @@ CIRGenFunction::emitAutoVarAlloca(const VarDecl &d, // If the type is variably-modified, emit all the VLA sizes for it. if (ty->isVariablyModifiedType()) - cgm.errorNYI(d.getSourceRange(), "emitAutoVarDecl: variably modified type"); + emitVariablyModifiedType(ty); assert(!cir::MissingFeatures::openMP()); Address address = Address::invalid(); - if (!ty->isConstantSizeType()) - cgm.errorNYI(d.getSourceRange(), "emitAutoVarDecl: non-constant size type"); - - // A normal fixed sized variable becomes an alloca in the entry block, - // unless: - // - it's an NRVO variable. - // - we are compiling OpenMP and it's an OpenMP local variable. - if (nrvo) { - // The named return value optimization: allocate this variable in the - // return slot, so that we can elide the copy when returning this - // variable (C++0x [class.copy]p34). - address = returnValue; - - if (const RecordDecl *rd = ty->getAsRecordDecl()) { - if (const auto *cxxrd = dyn_cast(rd); - (cxxrd && !cxxrd->hasTrivialDestructor()) || - rd->isNonTrivialToPrimitiveDestroy()) - cgm.errorNYI(d.getSourceRange(), "emitAutoVarAlloca: set NRVO flag"); + if (ty->isConstantSizeType()) { + // A normal fixed sized variable becomes an alloca in the entry block, + // unless: + // - it's an NRVO variable. 
+ // - we are compiling OpenMP and it's an OpenMP local variable. + if (nrvo) { + // The named return value optimization: allocate this variable in the + // return slot, so that we can elide the copy when returning this + // variable (C++0x [class.copy]p34). + address = returnValue; + + if (const RecordDecl *rd = ty->getAsRecordDecl()) { + if (const auto *cxxrd = dyn_cast(rd); + (cxxrd && !cxxrd->hasTrivialDestructor()) || + rd->isNonTrivialToPrimitiveDestroy()) + cgm.errorNYI(d.getSourceRange(), "emitAutoVarAlloca: set NRVO flag"); + } + } else { + // A normal fixed sized variable becomes an alloca in the entry block, + mlir::Type allocaTy = convertTypeForMem(ty); + // Create the temp alloca and declare variable using it. + address = createTempAlloca(allocaTy, alignment, loc, d.getName(), + /*arraySize=*/nullptr, /*alloca=*/nullptr, ip); + declare(address.getPointer(), &d, ty, getLoc(d.getSourceRange()), + alignment); } } else { - // A normal fixed sized variable becomes an alloca in the entry block, - mlir::Type allocaTy = convertTypeForMem(ty); - // Create the temp alloca and declare variable using it. - address = createTempAlloca(allocaTy, alignment, loc, d.getName(), - /*arraySize=*/nullptr, /*alloca=*/nullptr, ip); - declare(address.getPointer(), &d, ty, getLoc(d.getSourceRange()), - alignment); + // Non-constant size type + assert(!cir::MissingFeatures::openMP()); + if (!didCallStackSave) { + // Save the stack. + cir::PointerType defaultTy = AllocaInt8PtrTy; + CharUnits align = CharUnits::fromQuantity( + cgm.getDataLayout().getAlignment(defaultTy, false)); + Address stack = createTempAlloca(defaultTy, align, loc, "saved_stack"); + + mlir::Value v = builder.createStackSave(loc, defaultTy); + assert(v.getType() == AllocaInt8PtrTy); + builder.createStore(loc, v, stack); + + didCallStackSave = true; + + // Push a cleanup block and restore the stack there. + // FIXME: in general circumstances, this should be an EH cleanup. + pushStackRestore(NormalCleanup, stack); + } + + VlaSizePair vlaSize = getVLASize(ty); + mlir::Type memTy = convertTypeForMem(vlaSize.type); + + // Allocate memory for the array. + address = + createTempAlloca(memTy, alignment, loc, d.getName(), vlaSize.numElts, + /*alloca=*/nullptr, builder.saveInsertionPoint()); + + // If we have debug info enabled, properly describe the VLA dimensions for + // this type by registering the vla size expression for each of the + // dimensions. + assert(!cir::MissingFeatures::generateDebugInfo()); } emission.addr = address; @@ -696,8 +728,28 @@ struct DestroyObject final : EHScopeStack::Cleanup { cgf.emitDestroy(addr, type, destroyer); } }; + +struct CallStackRestore final : EHScopeStack::Cleanup { + Address stack; + CallStackRestore(Address stack) : stack(stack) {} + void emit(CIRGenFunction &cgf) override { + mlir::Location loc = stack.getPointer().getLoc(); + mlir::Value v = cgf.getBuilder().createLoad(loc, stack); + cgf.getBuilder().createStackRestore(loc, v); + } +}; } // namespace +/// Push the standard destructor for the given type as +/// at least a normal cleanup. 
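[Aside] The non-constant-size path added to emitAutoVarAlloca above is the classic VLA lowering; in source terms (VLAs are a C feature that Clang also accepts as a C++ extension):

```cpp
void useVLA(unsigned n) {
  int vla[n]; // stacksave, then a dynamically sized alloca of n elements
  vla[0] = 42;
} // the pushed CallStackRestore cleanup reinstates the saved stack pointer
```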
+void CIRGenFunction::pushDestroy(QualType::DestructionKind dtorKind,
+                                 Address addr, QualType type) {
+  assert(dtorKind && "cannot push destructor for trivial type");
+
+  CleanupKind cleanupKind = getCleanupKind(dtorKind);
+  pushDestroy(cleanupKind, addr, type, getDestroyer(dtorKind));
+}
+
 void CIRGenFunction::pushDestroy(CleanupKind cleanupKind, Address addr,
                                  QualType type, Destroyer *destroyer) {
   pushFullExprCleanup(cleanupKind, addr, type, destroyer);
@@ -805,6 +857,10 @@ CIRGenFunction::getDestroyer(QualType::DestructionKind kind) {
   llvm_unreachable("Unknown DestructionKind");
 }
 
+void CIRGenFunction::pushStackRestore(CleanupKind kind, Address spMem) {
+  ehStack.pushCleanup(kind, spMem);
+}
+
 /// Enter a destroy cleanup for the given local variable.
 void CIRGenFunction::emitAutoVarTypeCleanup(
     const CIRGenFunction::AutoVarEmission &emission,
diff --git a/clang/lib/CIR/CodeGen/CIRGenException.cpp b/clang/lib/CIR/CodeGen/CIRGenException.cpp
index f9ff37bc1975c..717a3e0032cea 100644
--- a/clang/lib/CIR/CodeGen/CIRGenException.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenException.cpp
@@ -69,6 +69,153 @@ mlir::LogicalResult CIRGenFunction::emitCXXTryStmt(const CXXTryStmt &s) {
   if (s.getTryBlock()->body_empty())
     return mlir::LogicalResult::success();
 
-  cgm.errorNYI("exitCXXTryStmt: CXXTryStmt with non-empty body");
-  return mlir::LogicalResult::success();
+  mlir::Location loc = getLoc(s.getSourceRange());
+  // Create a scope to hold try local storage for catch params.
+
+  mlir::OpBuilder::InsertPoint scopeIP;
+  cir::ScopeOp::create(
+      builder, loc,
+      /*scopeBuilder=*/[&](mlir::OpBuilder &b, mlir::Location loc) {
+        scopeIP = builder.saveInsertionPoint();
+      });
+
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  builder.restoreInsertionPoint(scopeIP);
+  mlir::LogicalResult result = emitCXXTryStmtUnderScope(s);
+  cir::YieldOp::create(builder, loc);
+  return result;
+}
+
+mlir::LogicalResult
+CIRGenFunction::emitCXXTryStmtUnderScope(const CXXTryStmt &s) {
+  const llvm::Triple &t = getTarget().getTriple();
+  // If we encounter a try statement in an OpenMP target region offloaded to
+  // a GPU, we treat it as a basic block.
+  const bool isTargetDevice =
+      (cgm.getLangOpts().OpenMPIsTargetDevice && (t.isNVPTX() || t.isAMDGCN()));
+  if (isTargetDevice) {
+    cgm.errorNYI(
+        "emitCXXTryStmtUnderScope: OpenMP target region offloaded to GPU");
+    return mlir::success();
+  }
+
+  unsigned numHandlers = s.getNumHandlers();
+  mlir::Location tryLoc = getLoc(s.getBeginLoc());
+  mlir::OpBuilder::InsertPoint beginInsertTryBody;
+
+  bool hasCatchAll = false;
+  for (unsigned i = 0; i != numHandlers; ++i) {
+    hasCatchAll |= s.getHandler(i)->getExceptionDecl() == nullptr;
+    if (hasCatchAll)
+      break;
+  }
+
+  // Create the scope to represent only the C/C++ `try {}` part. However,
+  // don't populate right away. Create regions for the catch handlers,
+  // but don't emit the handler bodies yet. For now, only make sure the
+  // scope returns the exception information.
+  auto tryOp = cir::TryOp::create(
+      builder, tryLoc,
+      /*tryBuilder=*/
+      [&](mlir::OpBuilder &b, mlir::Location loc) {
+        beginInsertTryBody = builder.saveInsertionPoint();
+      },
+      /*handlersBuilder=*/
+      [&](mlir::OpBuilder &b, mlir::Location loc,
+          mlir::OperationState &result) {
+        mlir::OpBuilder::InsertionGuard guard(b);
+
+        // We create an extra region for an unwind catch handler in case the
+        // catch-all handler doesn't exist
+        unsigned numRegionsToCreate =
+            hasCatchAll ?
numHandlers : numHandlers + 1; + + for (unsigned i = 0; i != numRegionsToCreate; ++i) { + mlir::Region *region = result.addRegion(); + builder.createBlock(region); + } + }); + + // Finally emit the body for try/catch. + { + mlir::Location loc = tryOp.getLoc(); + mlir::OpBuilder::InsertionGuard guard(builder); + builder.restoreInsertionPoint(beginInsertTryBody); + CIRGenFunction::LexicalScope tryScope{*this, loc, + builder.getInsertionBlock()}; + + tryScope.setAsTry(tryOp); + + // Attach the basic blocks for the catch regions. + enterCXXTryStmt(s, tryOp); + + // Emit the body for the `try {}` part. + { + mlir::OpBuilder::InsertionGuard guard(builder); + CIRGenFunction::LexicalScope tryBodyScope{*this, loc, + builder.getInsertionBlock()}; + if (emitStmt(s.getTryBlock(), /*useCurrentScope=*/true).failed()) + return mlir::failure(); + } + + // Emit catch clauses. + exitCXXTryStmt(s); + } + + return mlir::success(); +} + +void CIRGenFunction::enterCXXTryStmt(const CXXTryStmt &s, cir::TryOp tryOp, + bool isFnTryBlock) { + unsigned numHandlers = s.getNumHandlers(); + EHCatchScope *catchScope = ehStack.pushCatch(numHandlers); + for (unsigned i = 0; i != numHandlers; ++i) { + const CXXCatchStmt *catchStmt = s.getHandler(i); + if (catchStmt->getExceptionDecl()) { + cgm.errorNYI("enterCXXTryStmt: CatchStmt with ExceptionDecl"); + return; + } + + // No exception decl indicates '...', a catch-all. + mlir::Region *handler = &tryOp.getHandlerRegions()[i]; + catchScope->setHandler(i, cgm.getCXXABI().getCatchAllTypeInfo(), handler); + + // Under async exceptions, catch(...) needs to catch HW exception too + // Mark scope with SehTryBegin as a SEH __try scope + if (getLangOpts().EHAsynch) { + cgm.errorNYI("enterCXXTryStmt: EHAsynch"); + return; + } + } +} + +void CIRGenFunction::exitCXXTryStmt(const CXXTryStmt &s, bool isFnTryBlock) { + unsigned numHandlers = s.getNumHandlers(); + EHCatchScope &catchScope = cast(*ehStack.begin()); + assert(catchScope.getNumHandlers() == numHandlers); + cir::TryOp tryOp = curLexScope->getTry(); + + // If the catch was not required, bail out now. + if (!catchScope.mayThrow()) { + catchScope.clearHandlerBlocks(); + ehStack.popCatch(); + + // Drop all basic block from all catch regions. + SmallVector eraseBlocks; + for (mlir::Region &handlerRegion : tryOp.getHandlerRegions()) { + if (handlerRegion.empty()) + continue; + + for (mlir::Block &b : handlerRegion.getBlocks()) + eraseBlocks.push_back(&b); + } + + for (mlir::Block *b : eraseBlocks) + b->erase(); + + tryOp.setHandlerTypesAttr({}); + return; + } + + cgm.errorNYI("exitCXXTryStmt: Required catch"); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index f416571181153..52021fce1c675 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1626,14 +1626,15 @@ LValue CIRGenFunction::emitBinaryOperatorLValue(const BinaryOperator *e) { /// Emit code to compute the specified expression which /// can have any type. The result is returned as an RValue struct. 
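[Aside] Taken together, enterCXXTryStmt/exitCXXTryStmt above mean that a try whose body is not proven to throw gets its handler regions created and then erased again; and since the mayThrow() stub above returns false until EH-branch tracking lands, every catch scope currently takes that bail-out path. For example:

```cpp
int noThrowTry() {
  int x = 1;
  try {
    x = 2; // nothing here throws, so mayThrow() stays false
  } catch (...) {
    x = 3; // handler blocks are dropped again in exitCXXTryStmt
  }
  return x;
}
```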
-RValue CIRGenFunction::emitAnyExpr(const Expr *e, AggValueSlot aggSlot) { +RValue CIRGenFunction::emitAnyExpr(const Expr *e, AggValueSlot aggSlot, + bool ignoreResult) { switch (CIRGenFunction::getEvaluationKind(e->getType())) { case cir::TEK_Scalar: return RValue::get(emitScalarExpr(e)); case cir::TEK_Complex: return RValue::getComplex(emitComplexExpr(e)); case cir::TEK_Aggregate: { - if (aggSlot.isIgnored()) + if (!ignoreResult && aggSlot.isIgnored()) aggSlot = createAggTemp(e->getType(), getLoc(e->getSourceRange()), getCounterAggTmpAsString()); emitAggExpr(e, aggSlot); @@ -1674,7 +1675,25 @@ CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) { // name to make it clear it's not the actual builtin. auto fn = cast(curFn); if (fn.getName() != fdInlineName && onlyHasInlineBuiltinDeclaration(fd)) { - cgm.errorNYI("Inline only builtin function calls"); + cir::FuncOp clone = + mlir::cast_or_null(cgm.getGlobalValue(fdInlineName)); + + if (!clone) { + // Create a forward declaration - the body will be generated in + // generateCode when the function definition is processed + cir::FuncOp calleeFunc = emitFunctionDeclPointer(cgm, gd); + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(cgm.getModule().getBody()); + + clone = builder.create(calleeFunc.getLoc(), fdInlineName, + calleeFunc.getFunctionType()); + clone.setLinkageAttr(cir::GlobalLinkageKindAttr::get( + &cgm.getMLIRContext(), cir::GlobalLinkageKind::InternalLinkage)); + clone.setSymVisibility("private"); + clone.setInlineKindAttr(cir::InlineAttr::get( + &cgm.getMLIRContext(), cir::InlineKind::AlwaysInline)); + } + return CIRGenCallee::forDirect(clone, gd); } // Replaceable builtins provide their own implementation of a builtin. If we @@ -1869,8 +1888,7 @@ RValue CIRGenFunction::emitCallExpr(const clang::CallExpr *e, /// Emit code to compute the specified expression, ignoring the result. void CIRGenFunction::emitIgnoredExpr(const Expr *e) { if (e->isPRValue()) { - assert(!cir::MissingFeatures::aggValueSlot()); - emitAnyExpr(e); + emitAnyExpr(e, AggValueSlot::ignored(), /*ignoreResult=*/true); return; } @@ -2068,7 +2086,7 @@ mlir::Value CIRGenFunction::emitAlloca(StringRef name, mlir::Type ty, mlir::OpBuilder::InsertionGuard guard(builder); builder.restoreInsertionPoint(ip); addr = builder.createAlloca(loc, /*addr type*/ localVarPtrTy, - /*var type*/ ty, name, alignIntAttr); + /*var type*/ ty, name, alignIntAttr, arraySize); assert(!cir::MissingFeatures::astVarDeclInterface()); } return addr; @@ -2394,6 +2412,180 @@ LValue CIRGenFunction::emitPredefinedLValue(const PredefinedExpr *e) { return emitStringLiteralLValue(sl, gvName); } +LValue CIRGenFunction::emitOpaqueValueLValue(const OpaqueValueExpr *e) { + assert(OpaqueValueMappingData::shouldBindAsLValue(e)); + return getOrCreateOpaqueLValueMapping(e); +} + +namespace { +// Handle the case where the condition is a constant evaluatable simple integer, +// which means we don't have to separately handle the true/false blocks. +std::optional handleConditionalOperatorLValueSimpleCase( + CIRGenFunction &cgf, const AbstractConditionalOperator *e) { + const Expr *condExpr = e->getCond(); + llvm::APSInt condExprVal; + if (!cgf.constantFoldsToSimpleInteger(condExpr, condExprVal)) + return std::nullopt; + + const Expr *live = e->getTrueExpr(), *dead = e->getFalseExpr(); + if (!condExprVal.getBoolValue()) + std::swap(live, dead); + + if (cgf.containsLabel(dead)) + return std::nullopt; + + // If the true case is live, we need to track its region. 
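[Aside] The simple case handled above, in source form: when the condition folds to a constant and the dead arm contains no label, only the live arm is emitted as an lvalue:

```cpp
extern int a, b;

void assignThroughConditional() {
  (true ? a : b) = 5;  // only the lvalue for `a` is emitted; `b` is dead
  (false ? a : b) = 7; // only the lvalue for `b` is emitted
}
```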
+ assert(!cir::MissingFeatures::incrementProfileCounter()); + assert(!cir::MissingFeatures::pgoUse()); + // If a throw expression we emit it and return an undefined lvalue + // because it can't be used. + if (auto *throwExpr = dyn_cast(live->IgnoreParens())) { + cgf.emitCXXThrowExpr(throwExpr); + // Return an undefined lvalue - the throw terminates execution + // so this value will never actually be used + mlir::Type elemTy = cgf.convertType(dead->getType()); + mlir::Value undefPtr = + cgf.getBuilder().getNullPtr(cgf.getBuilder().getPointerTo(elemTy), + cgf.getLoc(throwExpr->getSourceRange())); + return cgf.makeAddrLValue(Address(undefPtr, elemTy, CharUnits::One()), + dead->getType()); + } + return cgf.emitLValue(live); +} + +/// Emit the operand of a glvalue conditional operator. This is either a glvalue +/// or a (possibly-parenthesized) throw-expression. If this is a throw, no +/// LValue is returned and the current block has been terminated. +static std::optional emitLValueOrThrowExpression(CIRGenFunction &cgf, + const Expr *operand) { + if (auto *throwExpr = dyn_cast(operand->IgnoreParens())) { + cgf.emitCXXThrowExpr(throwExpr); + return std::nullopt; + } + + return cgf.emitLValue(operand); +} +} // namespace + +// Create and generate the 3 blocks for a conditional operator. +// Leaves the 'current block' in the continuation basic block. +template +CIRGenFunction::ConditionalInfo +CIRGenFunction::emitConditionalBlocks(const AbstractConditionalOperator *e, + const FuncTy &branchGenFunc) { + ConditionalInfo info; + ConditionalEvaluation eval(*this); + mlir::Location loc = getLoc(e->getSourceRange()); + CIRGenBuilderTy &builder = getBuilder(); + + mlir::Value condV = emitOpOnBoolExpr(loc, e->getCond()); + SmallVector insertPoints{}; + mlir::Type yieldTy{}; + + auto emitBranch = [&](mlir::OpBuilder &b, mlir::Location loc, + const Expr *expr, std::optional &resultLV) { + CIRGenFunction::LexicalScope lexScope{*this, loc, b.getInsertionBlock()}; + curLexScope->setAsTernary(); + + assert(!cir::MissingFeatures::incrementProfileCounter()); + eval.beginEvaluation(); + resultLV = branchGenFunc(*this, expr); + mlir::Value resultPtr = resultLV ? resultLV->getPointer() : mlir::Value(); + eval.endEvaluation(); + + if (resultPtr) { + yieldTy = resultPtr.getType(); + cir::YieldOp::create(b, loc, resultPtr); + } else { + // If LHS or RHS is a void expression we need + // to patch arms as to properly match yield types. + // If the current block's terminator is an UnreachableOp (from a throw), + // we don't need a yield + if (builder.getInsertionBlock()->mightHaveTerminator()) { + mlir::Operation *terminator = + builder.getInsertionBlock()->getTerminator(); + if (isa_and_nonnull(terminator)) + insertPoints.push_back(b.saveInsertionPoint()); + } + } + }; + + info.result = cir::TernaryOp::create( + builder, loc, condV, + /*trueBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + emitBranch(b, loc, e->getTrueExpr(), info.lhs); + }, + /*falseBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + emitBranch(b, loc, e->getFalseExpr(), info.rhs); + }) + .getResult(); + + // If both arms are void, so be it. + if (!yieldTy) + yieldTy = VoidTy; + + // Insert required yields. + for (mlir::OpBuilder::InsertPoint &toInsert : insertPoints) { + mlir::OpBuilder::InsertionGuard guard(builder); + builder.restoreInsertionPoint(toInsert); + + // Block does not return: build empty yield. + if (!yieldTy) { + cir::YieldOp::create(builder, loc); + } else { // Block returns: set null yield value. 
+      mlir::Value op0 = builder.getNullValue(yieldTy, loc);
+      cir::YieldOp::create(builder, loc, op0);
+    }
+  }
+
+  return info;
+}
+
+LValue CIRGenFunction::emitConditionalOperatorLValue(
+    const AbstractConditionalOperator *expr) {
+  if (!expr->isGLValue()) {
+    // ?: here should be an aggregate.
+    assert(hasAggregateEvaluationKind(expr->getType()) &&
+           "Unexpected conditional operator!");
+    return emitAggExprToLValue(expr);
+  }
+
+  OpaqueValueMapping binding(*this, expr);
+  if (std::optional<LValue> res =
+          handleConditionalOperatorLValueSimpleCase(*this, expr))
+    return *res;
+
+  ConditionalInfo info =
+      emitConditionalBlocks(expr, [](CIRGenFunction &cgf, const Expr *e) {
+        return emitLValueOrThrowExpression(cgf, e);
+      });
+
+  if ((info.lhs && !info.lhs->isSimple()) ||
+      (info.rhs && !info.rhs->isSimple())) {
+    cgm.errorNYI(expr->getSourceRange(),
+                 "unsupported conditional operator with non-simple lvalue");
+    return LValue();
+  }
+
+  if (info.lhs && info.rhs) {
+    Address lhsAddr = info.lhs->getAddress();
+    Address rhsAddr = info.rhs->getAddress();
+    Address result(info.result, lhsAddr.getElementType(),
+                   std::min(lhsAddr.getAlignment(), rhsAddr.getAlignment()));
+    AlignmentSource alignSource =
+        std::max(info.lhs->getBaseInfo().getAlignmentSource(),
+                 info.rhs->getBaseInfo().getAlignmentSource());
+    assert(!cir::MissingFeatures::opTBAA());
+    return makeAddrLValue(result, expr->getType(), LValueBaseInfo(alignSource));
+  }
+
+  assert((info.lhs || info.rhs) &&
+         "both operands of glvalue conditional are throw-expressions?");
+  return info.lhs ? *info.lhs : *info.rhs;
+}
+
 /// An LValue is a candidate for having its loads and stores be made atomic if
 /// we are operating under /volatile:ms *and* the LValue itself is volatile and
 /// performing such an operation can be performed without a libcall.
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
index 901b937e4e3e7..568cbdb06bb48 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
@@ -24,6 +24,73 @@ using namespace clang;
 using namespace clang::CIRGen;

 namespace {
+// FIXME(cir): This should be a common helper between CIRGen
+// and traditional CodeGen
+/// Is the value of the given expression possibly a reference to or
+/// into a __block variable?
+static bool isBlockVarRef(const Expr *e) {
+  // Make sure we look through parens.
+  e = e->IgnoreParens();
+
+  // Check for a direct reference to a __block variable.
+  if (const DeclRefExpr *dre = dyn_cast<DeclRefExpr>(e)) {
+    const VarDecl *var = dyn_cast<VarDecl>(dre->getDecl());
+    return (var && var->hasAttr<BlocksAttr>());
+  }
+
+  // More complicated stuff.
+
+  // Binary operators.
+  if (const BinaryOperator *op = dyn_cast<BinaryOperator>(e)) {
+    // For an assignment or pointer-to-member operation, just care
+    // about the LHS.
+    if (op->isAssignmentOp() || op->isPtrMemOp())
+      return isBlockVarRef(op->getLHS());
+
+    // For a comma, just care about the RHS.
+    if (op->getOpcode() == BO_Comma)
+      return isBlockVarRef(op->getRHS());
+
+    // FIXME: pointer arithmetic?
+    return false;
+
+    // Check both sides of a conditional operator.
+  } else if (const AbstractConditionalOperator *op =
+                 dyn_cast<AbstractConditionalOperator>(e)) {
+    return isBlockVarRef(op->getTrueExpr()) ||
+           isBlockVarRef(op->getFalseExpr());
+
+    // OVEs are required to support BinaryConditionalOperators.
+  } else if (const OpaqueValueExpr *op = dyn_cast<OpaqueValueExpr>(e)) {
+    if (const Expr *src = op->getSourceExpr())
+      return isBlockVarRef(src);
+
+    // Casts are necessary to get things like (*(int*)&var) = foo().
+    // We don't really care about the kind of cast here, except
+    // we don't want to look through l2r casts, because it's okay
+    // to get the *value* in a __block variable.
+  } else if (const CastExpr *cast = dyn_cast<CastExpr>(e)) {
+    if (cast->getCastKind() == CK_LValueToRValue)
+      return false;
+    return isBlockVarRef(cast->getSubExpr());
+
+    // Handle unary operators. Again, just aggressively look through
+    // it, ignoring the operation.
+  } else if (const UnaryOperator *uop = dyn_cast<UnaryOperator>(e)) {
+    return isBlockVarRef(uop->getSubExpr());
+
+    // Look into the base of a field access.
+  } else if (const MemberExpr *mem = dyn_cast<MemberExpr>(e)) {
+    return isBlockVarRef(mem->getBase());
+
+    // Look into the base of a subscript.
+  } else if (const ArraySubscriptExpr *sub = dyn_cast<ArraySubscriptExpr>(e)) {
+    return isBlockVarRef(sub->getBase());
+  }
+
+  return false;
+}
+
 class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
   CIRGenFunction &cgf;
@@ -41,9 +108,7 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
   AggValueSlot ensureSlot(mlir::Location loc, QualType t) {
     if (!dest.isIgnored())
       return dest;
-
-    cgf.cgm.errorNYI(loc, "Slot for ignored address");
-    return dest;
+    return cgf.createAggTemp(t, loc, "agg.tmp.ensured");
   }

   void ensureDest(mlir::Location loc, QualType ty) {
@@ -89,6 +154,47 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
     (void)cgf.emitCompoundStmt(*e->getSubStmt(), &retAlloca, dest);
   }

+  void VisitBinAssign(const BinaryOperator *e) {
+    // For an assignment to work, the value on the right has
+    // to be compatible with the value on the left.
+    assert(cgf.getContext().hasSameUnqualifiedType(e->getLHS()->getType(),
+                                                   e->getRHS()->getType()) &&
+           "Invalid assignment");
+
+    if (isBlockVarRef(e->getLHS()) &&
+        e->getRHS()->HasSideEffects(cgf.getContext())) {
+      cgf.cgm.errorNYI(e->getSourceRange(),
+                       "block var reference with side effects");
+      return;
+    }
+
+    LValue lhs = cgf.emitLValue(e->getLHS());
+
+    // If we have an atomic type, evaluate into the destination and then
+    // do an atomic copy.
+    assert(!cir::MissingFeatures::atomicTypes());
+
+    // Codegen the RHS so that it stores directly into the LHS.
+    assert(!cir::MissingFeatures::aggValueSlotGC());
+    AggValueSlot lhsSlot = AggValueSlot::forLValue(
+        lhs, AggValueSlot::IsDestructed, AggValueSlot::IsAliased,
+        AggValueSlot::MayOverlap);
+
+    // A non-volatile aggregate destination might have a volatile member.
+    if (!lhsSlot.isVolatile() && cgf.hasVolatileMember(e->getLHS()->getType()))
+      lhsSlot.setVolatile(true);
+
+    cgf.emitAggExpr(e->getRHS(), lhsSlot);
+
+    // Copy into the destination if the assignment isn't ignored.
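+    // (When 'dest' is the ignored slot, emitFinalDestCopy should reduce to a
+    // no-op.)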
+ emitFinalDestCopy(e->getType(), lhs); + + if (!dest.isIgnored() && !dest.isExternallyDestructed() && + e->getType().isDestructedType() == QualType::DK_nontrivial_c_struct) + cgf.pushDestroy(QualType::DK_nontrivial_c_struct, dest.getAddress(), + e->getType()); + } + void VisitDeclRefExpr(DeclRefExpr *e) { emitAggLoadOfLValue(e); } void VisitInitListExpr(InitListExpr *e); @@ -170,19 +276,10 @@ class AggExprEmitter : public StmtVisitor { void VisitConstantExpr(ConstantExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitConstantExpr"); } - void VisitMemberExpr(MemberExpr *e) { - cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitMemberExpr"); - } - void VisitUnaryDeref(UnaryOperator *e) { - cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitUnaryDeref"); - } - void VisitStringLiteral(StringLiteral *e) { - cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitStringLiteral"); - } - void VisitCompoundLiteralExpr(CompoundLiteralExpr *e) { - cgf.cgm.errorNYI(e->getSourceRange(), - "AggExprEmitter: VisitCompoundLiteralExpr"); - } + void VisitMemberExpr(MemberExpr *e) { emitAggLoadOfLValue(e); } + void VisitUnaryDeref(UnaryOperator *e) { emitAggLoadOfLValue(e); } + void VisitStringLiteral(StringLiteral *e) { emitAggLoadOfLValue(e); } + void VisitCompoundLiteralExpr(CompoundLiteralExpr *e); void VisitPredefinedExpr(const PredefinedExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitPredefinedExpr"); @@ -195,9 +292,6 @@ class AggExprEmitter : public StmtVisitor { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitPointerToDataMemberBinaryOperator"); } - void VisitBinAssign(const BinaryOperator *e) { - cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitBinAssign"); - } void VisitBinComma(const BinaryOperator *e) { cgf.emitIgnoredExpr(e->getLHS()); Visit(e->getRHS()); @@ -325,6 +419,31 @@ void AggExprEmitter::emitAggLoadOfLValue(const Expr *e) { emitFinalDestCopy(e->getType(), lv); } +void AggExprEmitter::VisitCompoundLiteralExpr(CompoundLiteralExpr *e) { + if (dest.isPotentiallyAliased() && e->getType().isPODType(cgf.getContext())) { + // For a POD type, just emit a load of the lvalue + a copy, because our + // compound literal might alias the destination. + emitAggLoadOfLValue(e); + return; + } + + AggValueSlot slot = ensureSlot(cgf.getLoc(e->getSourceRange()), e->getType()); + + // Block-scope compound literals are destroyed at the end of the enclosing + // scope in C. 
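+  // Take ownership of destruction here: marking the slot externally
+  // destructed keeps the initializer emission below from pushing its own
+  // cleanup.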
+ bool destruct = + !cgf.getLangOpts().CPlusPlus && !slot.isExternallyDestructed(); + if (destruct) + slot.setExternallyDestructed(); + + cgf.emitAggExpr(e->getInitializer(), slot); + + if (destruct) + if ([[maybe_unused]] QualType::DestructionKind dtorKind = + e->getType().isDestructedType()) + cgf.cgm.errorNYI(e->getSourceRange(), "compound literal with destructor"); +} + void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy, QualType arrayQTy, Expr *e, ArrayRef args, Expr *arrayFiller) { @@ -487,7 +606,8 @@ void AggExprEmitter::emitCopy(QualType type, const AggValueSlot &dest, LValue destLV = cgf.makeAddrLValue(dest.getAddress(), type); LValue srcLV = cgf.makeAddrLValue(src.getAddress(), type); assert(!cir::MissingFeatures::aggValueSlotVolatile()); - cgf.emitAggregateCopy(destLV, srcLV, type, dest.mayOverlap()); + cgf.emitAggregateCopy(destLV, srcLV, type, dest.mayOverlap(), + dest.isVolatile() || src.isVolatile()); } void AggExprEmitter::emitInitializationToLValue(Expr *e, LValue lv) { @@ -788,7 +908,8 @@ void CIRGenFunction::emitAggExpr(const Expr *e, AggValueSlot slot) { } void CIRGenFunction::emitAggregateCopy(LValue dest, LValue src, QualType ty, - AggValueSlot::Overlap_t mayOverlap) { + AggValueSlot::Overlap_t mayOverlap, + bool isVolatile) { // TODO(cir): this function needs improvements, commented code for now since // this will be touched again soon. assert(!ty->isAnyComplexType() && "Unexpected copy of complex"); @@ -844,7 +965,7 @@ void CIRGenFunction::emitAggregateCopy(LValue dest, LValue src, QualType ty, cgm.errorNYI("emitAggregateCopy: GC"); [[maybe_unused]] cir::CopyOp copyOp = - builder.createCopy(destPtr.getPointer(), srcPtr.getPointer()); + builder.createCopy(destPtr.getPointer(), srcPtr.getPointer(), isVolatile); assert(!cir::MissingFeatures::opTBAA()); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index fcde4875393cd..d8f4943a7755a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -196,9 +196,8 @@ class ComplexExprEmitter : public StmtVisitor { return Visit(e->getSubExpr()); } mlir::Value VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) { - cgf.cgm.errorNYI(dae->getExprLoc(), - "ComplexExprEmitter VisitCXXDefaultArgExpr"); - return {}; + CIRGenFunction::CXXDefaultArgExprScope scope(cgf, dae); + return Visit(dae->getExpr()); } mlir::Value VisitCXXDefaultInitExpr(CXXDefaultInitExpr *die) { CIRGenFunction::CXXDefaultInitExprScope scope(cgf, die); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp index 89e9ec4e9d613..19ed6560245b4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp @@ -614,7 +614,7 @@ bool ConstRecordBuilder::applyZeroInitPadding(const ASTRecordLayout &layout, bool ConstRecordBuilder::build(InitListExpr *ile, bool allowOverwrite) { RecordDecl *rd = ile->getType() ->castAs() - ->getOriginalDecl() + ->getDecl() ->getDefinitionOrSelf(); const ASTRecordLayout &layout = cgm.getASTContext().getASTRecordLayout(rd); @@ -817,9 +817,8 @@ bool ConstRecordBuilder::build(const APValue &val, const RecordDecl *rd, mlir::Attribute ConstRecordBuilder::finalize(QualType type) { type = type.getNonReferenceType(); - RecordDecl *rd = type->castAs() - ->getOriginalDecl() - ->getDefinitionOrSelf(); + RecordDecl *rd = + type->castAs()->getDecl()->getDefinitionOrSelf(); mlir::Type valTy = cgm.convertType(type); return 
builder.build(valTy, rd->hasFlexibleArrayMember()); } @@ -842,9 +841,8 @@ mlir::Attribute ConstRecordBuilder::buildRecord(ConstantEmitter &emitter, ConstantAggregateBuilder constant(emitter.cgm); ConstRecordBuilder builder(emitter, constant, CharUnits::Zero()); - const RecordDecl *rd = valTy->castAs() - ->getOriginalDecl() - ->getDefinitionOrSelf(); + const RecordDecl *rd = + valTy->castAs()->getDecl()->getDefinitionOrSelf(); const CXXRecordDecl *cd = dyn_cast(rd); if (!builder.build(val, rd, false, cd, CharUnits::Zero())) return nullptr; @@ -873,7 +871,7 @@ bool ConstRecordBuilder::updateRecord(ConstantEmitter &emitter, class ConstExprEmitter : public StmtVisitor { CIRGenModule &cgm; - LLVM_ATTRIBUTE_UNUSED ConstantEmitter &emitter; + [[maybe_unused]] ConstantEmitter &emitter; public: ConstExprEmitter(ConstantEmitter &emitter) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 637f9ef65c88f..138082b59fecd 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -1734,9 +1734,9 @@ mlir::Value ScalarExprEmitter::emitSub(const BinOpInfo &ops) { // LLVM we shall take VLA's, division by element size, etc. // // See more in `EmitSub` in CGExprScalar.cpp. - assert(!cir::MissingFeatures::ptrDiffOp()); - cgf.cgm.errorNYI("ptrdiff"); - return {}; + assert(!cir::MissingFeatures::llvmLoweringPtrDiffConsidersPointee()); + return cir::PtrDiffOp::create(builder, cgf.getLoc(ops.loc), cgf.PtrDiffTy, + ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitShl(const BinOpInfo &ops) { diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index 01a43a997637c..d3c0d9f109317 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -410,6 +410,8 @@ void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType, curFn = fn; const Decl *d = gd.getDecl(); + + didCallStackSave = false; curCodeDecl = d; const auto *fd = dyn_cast_or_null(d); curFuncDecl = d->getNonClosureContext(); @@ -549,6 +551,49 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn, const auto funcDecl = cast(gd.getDecl()); curGD = gd; + if (funcDecl->isInlineBuiltinDeclaration()) { + // When generating code for a builtin with an inline declaration, use a + // mangled name to hold the actual body, while keeping an external + // declaration in case the function pointer is referenced somewhere. + std::string fdInlineName = (cgm.getMangledName(funcDecl) + ".inline").str(); + cir::FuncOp clone = + mlir::cast_or_null(cgm.getGlobalValue(fdInlineName)); + if (!clone) { + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPoint(fn); + clone = builder.create(fn.getLoc(), fdInlineName, + fn.getFunctionType()); + clone.setLinkage(cir::GlobalLinkageKind::InternalLinkage); + clone.setSymVisibility("private"); + clone.setInlineKind(cir::InlineKind::AlwaysInline); + } + fn.setLinkage(cir::GlobalLinkageKind::ExternalLinkage); + fn.setSymVisibility("private"); + fn = clone; + } else { + // Detect the unusual situation where an inline version is shadowed by a + // non-inline version. In that case we should pick the external one + // everywhere. That's GCC behavior too. 
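+    // Walk the redeclaration chain looking for an inline builtin declaration
+    // that this non-inline definition shadows.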
+ for (const FunctionDecl *pd = funcDecl->getPreviousDecl(); pd; + pd = pd->getPreviousDecl()) { + if (LLVM_UNLIKELY(pd->isInlineBuiltinDeclaration())) { + std::string inlineName = funcDecl->getName().str() + ".inline"; + if (auto inlineFn = mlir::cast_or_null( + cgm.getGlobalValue(inlineName))) { + // Replace all uses of the .inline function with the regular function + // FIXME: This performs a linear walk over the module. Introduce some + // caching here. + if (inlineFn + .replaceAllSymbolUses(fn.getSymNameAttr(), cgm.getModule()) + .failed()) + llvm_unreachable("Failed to replace inline builtin symbol uses"); + inlineFn.erase(); + } + break; + } + } + } + SourceLocation loc = funcDecl->getLocation(); Stmt *body = funcDecl->getBody(); SourceRange bodyRange = @@ -820,6 +865,10 @@ LValue CIRGenFunction::emitLValue(const Expr *e) { std::string("l-value not implemented for '") + e->getStmtClassName() + "'"); return LValue(); + case Expr::ConditionalOperatorClass: + return emitConditionalOperatorLValue(cast(e)); + case Expr::BinaryConditionalOperatorClass: + return emitConditionalOperatorLValue(cast(e)); case Expr::ArraySubscriptExprClass: return emitArraySubscriptExpr(cast(e)); case Expr::UnaryOperatorClass: @@ -864,6 +913,8 @@ LValue CIRGenFunction::emitLValue(const Expr *e) { return emitCastLValue(cast(e)); case Expr::MaterializeTemporaryExprClass: return emitMaterializeTemporaryExpr(cast(e)); + case Expr::OpaqueValueExprClass: + return emitOpaqueValueLValue(cast(e)); case Expr::ChooseExprClass: return emitLValue(cast(e)->getChosenSubExpr()); } @@ -1006,6 +1057,41 @@ mlir::Value CIRGenFunction::emitAlignmentAssumption( offsetValue); } +CIRGenFunction::VlaSizePair CIRGenFunction::getVLASize(QualType type) { + const VariableArrayType *vla = + cgm.getASTContext().getAsVariableArrayType(type); + assert(vla && "type was not a variable array type!"); + return getVLASize(vla); +} + +CIRGenFunction::VlaSizePair +CIRGenFunction::getVLASize(const VariableArrayType *type) { + // The number of elements so far; always size_t. + mlir::Value numElements; + + QualType elementType; + do { + elementType = type->getElementType(); + mlir::Value vlaSize = vlaSizeMap[type->getSizeExpr()]; + assert(vlaSize && "no size for VLA!"); + assert(vlaSize.getType() == SizeTy); + + if (!numElements) { + numElements = vlaSize; + } else { + // It's undefined behavior if this wraps around, so mark it that way. + // FIXME: Teach -fsanitize=undefined to trap this. + + numElements = + builder.createMul(numElements.getLoc(), numElements, vlaSize, + cir::OverflowBehavior::NoUnsignedWrap); + } + } while ((type = getContext().getAsVariableArrayType(elementType))); + + assert(numElements && "Undefined elements number"); + return {numElements, elementType}; +} + // TODO(cir): Most of this function can be shared between CIRGen // and traditional LLVM codegen void CIRGenFunction::emitVariablyModifiedType(QualType type) { @@ -1086,7 +1172,26 @@ void CIRGenFunction::emitVariablyModifiedType(QualType type) { break; case Type::VariableArray: { - cgm.errorNYI("CIRGenFunction::emitVariablyModifiedType VLA"); + // Losing element qualification here is fine. + const VariableArrayType *vat = cast(ty); + + // Unknown size indication requires no size computation. + // Otherwise, evaluate and record it. + if (const Expr *sizeExpr = vat->getSizeExpr()) { + // It's possible that we might have emitted this already, + // e.g. with a typedef and a pointer to it. 
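+      // vlaSizeMap is keyed on the size expression, so revisiting the same
+      // VLA through different type sugar reuses the cached size value.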
+        mlir::Value &entry = vlaSizeMap[sizeExpr];
+        if (!entry) {
+          mlir::Value size = emitScalarExpr(sizeExpr);
+          assert(!cir::MissingFeatures::sanitizers());
+
+          // Always zexting here would be wrong if it weren't
+          // undefined behavior to have a negative bound.
+          // FIXME: What about when size's type is larger than size_t?
+          entry = builder.createIntCast(size, SizeTy);
+        }
+      }
+      type = vat->getElementType();
       break;
     }
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index d71de2ffde6a1..fa3260c5ec5b8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -149,6 +149,10 @@ class CIRGenFunction : public CIRGenTypeCache {
   using SymTableTy = llvm::ScopedHashTable<const clang::Decl *, mlir::Value>;
   SymTableTy symbolTable;

+  /// Whether a cir.stacksave operation has been added. Used to avoid
+  /// inserting cir.stacksave for multiple VLAs in the same scope.
+  bool didCallStackSave = false;
+
   /// Whether or not a Microsoft-style asm block has been processed within
   /// this function. These can potentially set the return value.
   bool sawAsmBlock = false;
@@ -188,6 +192,14 @@ class CIRGenFunction : public CIRGenTypeCache {
   llvm::DenseMap<const OpaqueValueExpr *, LValue> opaqueLValues;
   llvm::DenseMap<const OpaqueValueExpr *, RValue> opaqueRValues;

+  // This keeps track of the associated size for each VLA type.
+  // We track this by the size expression rather than the type itself because
+  // in certain situations, like a const qualifier applied to a VLA typedef,
+  // multiple VLA types can share the same size expression.
+  // FIXME: Maybe this could be a stack of maps that is pushed/popped as we
+  // enter/leave scopes.
+  llvm::DenseMap<const Expr *, mlir::Value> vlaSizeMap;
+
 public:
   /// A non-RAII class containing all the information about a bound
   /// opaque value. OpaqueValueMapping, below, is a RAII wrapper for
@@ -436,6 +448,20 @@ class CIRGenFunction : public CIRGenTypeCache {
     }
   };

+  struct VlaSizePair {
+    mlir::Value numElts;
+    QualType type;
+
+    VlaSizePair(mlir::Value num, QualType ty) : numElts(num), type(ty) {}
+  };
+
+  /// Returns an mlir::Value+QualType pair that corresponds to the size,
+  /// in non-variably-sized elements, of a variable length array type,
+  /// plus that largest non-variably-sized element type. Assumes that
+  /// the type has already been emitted with emitVariablyModifiedType.
+  VlaSizePair getVLASize(const VariableArrayType *type);
+  VlaSizePair getVLASize(QualType type);
+
   void finishFunction(SourceLocation endLoc);

   /// Determine whether the given initializer is trivial in the sense
@@ -583,6 +609,8 @@ class CIRGenFunction : public CIRGenTypeCache {
     return needsEHCleanup(kind) ? NormalAndEHCleanup : NormalCleanup;
   }

+  void pushStackRestore(CleanupKind kind, Address spMem);
+
   /// Set the address of a local variable.
   void setAddrOfLocalVar(const clang::VarDecl *vd, Address addr) {
     assert(!localDeclMap.count(vd) && "Decl already exists in LocalDeclMap!");
@@ -705,6 +733,11 @@ class CIRGenFunction : public CIRGenTypeCache {
     SourceLocExprScopeGuard sourceLocScope;
   };

+  struct CXXDefaultArgExprScope : SourceLocExprScopeGuard {
+    CXXDefaultArgExprScope(CIRGenFunction &cgf, const CXXDefaultArgExpr *e)
+        : SourceLocExprScopeGuard(e, cgf.curSourceLocExprScope) {}
+  };
+
   LValue makeNaturalAlignPointeeAddrLValue(mlir::Value v, clang::QualType t);
   LValue makeNaturalAlignAddrLValue(mlir::Value val, QualType ty);
@@ -825,6 +858,13 @@ class CIRGenFunction : public CIRGenTypeCache {
                          FunctionArgList args, clang::SourceLocation loc,
                          clang::SourceLocation startLoc);

+  /// Returns true if the aggregate type has a volatile member.
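+  /// Used when emitting aggregate assignments to decide whether the copy
+  /// needs to be volatile.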
+ bool hasVolatileMember(QualType t) { + if (const auto *rd = t->getAsRecordDecl()) + return rd->hasVolatileMember(); + return false; + } + /// The cleanup depth enclosing all the cleanups associated with the /// parameters. EHScopeStack::stable_iterator prologueCleanupDepth; @@ -854,6 +894,7 @@ class CIRGenFunction : public CIRGenTypeCache { protected: bool performCleanup; + bool oldDidCallStackSave; private: RunCleanupsScope(const RunCleanupsScope &) = delete; @@ -867,6 +908,8 @@ class CIRGenFunction : public CIRGenTypeCache { explicit RunCleanupsScope(CIRGenFunction &cgf) : performCleanup(true), cgf(cgf) { cleanupStackDepth = cgf.ehStack.stable_begin(); + oldDidCallStackSave = cgf.didCallStackSave; + cgf.didCallStackSave = false; oldCleanupStackDepth = cgf.currentCleanupStackDepth; cgf.currentCleanupStackDepth = cleanupStackDepth; } @@ -883,6 +926,7 @@ class CIRGenFunction : public CIRGenTypeCache { assert(performCleanup && "Already forced cleanup"); { mlir::OpBuilder::InsertionGuard guard(cgf.getBuilder()); + cgf.didCallStackSave = oldDidCallStackSave; cgf.popCleanupBlocks(cleanupStackDepth); performCleanup = false; cgf.currentCleanupStackDepth = oldCleanupStackDepth; @@ -910,6 +954,9 @@ class CIRGenFunction : public CIRGenTypeCache { LexicalScope *parentScope = nullptr; + // Holds the actual value for ScopeKind::Try + cir::TryOp tryOp = nullptr; + // Only Regular is used at the moment. Support for other kinds will be // added as the relevant statements/expressions are upstreamed. enum Kind { @@ -969,6 +1016,10 @@ class CIRGenFunction : public CIRGenTypeCache { void setAsGlobalInit() { scopeKind = Kind::GlobalInit; } void setAsSwitch() { scopeKind = Kind::Switch; } void setAsTernary() { scopeKind = Kind::Ternary; } + void setAsTry(cir::TryOp op) { + scopeKind = Kind::Try; + tryOp = op; + } // Lazy create cleanup block or return what's available. mlir::Block *getOrCreateCleanupBlock(mlir::OpBuilder &builder) { @@ -978,6 +1029,11 @@ class CIRGenFunction : public CIRGenTypeCache { return cleanupBlock; } + cir::TryOp getTry() { + assert(isTry()); + return tryOp; + } + mlir::Block *getCleanupBlock(mlir::OpBuilder &builder) { return cleanupBlock; } @@ -1045,6 +1101,9 @@ class CIRGenFunction : public CIRGenTypeCache { static Destroyer destroyCXXObject; + void pushDestroy(QualType::DestructionKind dtorKind, Address addr, + QualType type); + void pushDestroy(CleanupKind kind, Address addr, QualType type, Destroyer *destroyer); @@ -1099,14 +1158,16 @@ class CIRGenFunction : public CIRGenTypeCache { /// occupied by some other object. More efficient code can often be /// generated if not. void emitAggregateCopy(LValue dest, LValue src, QualType eltTy, - AggValueSlot::Overlap_t mayOverlap); + AggValueSlot::Overlap_t mayOverlap, + bool isVolatile = false); /// Emit code to compute the specified expression which can have any type. The /// result is returned as an RValue struct. If this is an aggregate /// expression, the aggloc/agglocvolatile arguments indicate where the result /// should be returned. RValue emitAnyExpr(const clang::Expr *e, - AggValueSlot aggSlot = AggValueSlot::ignored()); + AggValueSlot aggSlot = AggValueSlot::ignored(), + bool ignoreResult = false); /// Emits the code necessary to evaluate an arbitrary expression into the /// given memory location. 
@@ -1281,10 +1342,10 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::Value emitCXXNewExpr(const CXXNewExpr *e); - void emitNewArrayInitializer(const CXXNewExpr *E, QualType ElementType, - mlir::Type ElementTy, Address BeginPtr, - mlir::Value NumElements, - mlir::Value AllocSizeWithoutCookie); + void emitNewArrayInitializer(const CXXNewExpr *e, QualType elementType, + mlir::Type elementTy, Address beginPtr, + mlir::Value numElements, + mlir::Value allocSizeWithoutCookie); RValue emitCXXOperatorMemberCallExpr(const CXXOperatorCallExpr *e, const CXXMethodDecl *md, @@ -1299,6 +1360,13 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitCXXTryStmt(const clang::CXXTryStmt &s); + mlir::LogicalResult emitCXXTryStmtUnderScope(const clang::CXXTryStmt &s); + + void enterCXXTryStmt(const CXXTryStmt &s, cir::TryOp tryOp, + bool isFnTryBlock = false); + + void exitCXXTryStmt(const CXXTryStmt &s, bool isFnTryBlock = false); + void emitCtorPrologue(const clang::CXXConstructorDecl *ctor, clang::CXXCtorType ctorType, FunctionArgList &args); @@ -1486,6 +1554,10 @@ class CIRGenFunction : public CIRGenTypeCache { LValue emitMemberExpr(const MemberExpr *e); + LValue emitOpaqueValueLValue(const OpaqueValueExpr *e); + + LValue emitConditionalOperatorLValue(const AbstractConditionalOperator *expr); + /// Given an expression with a pointer type, emit the value and compute our /// best estimate of the alignment of the pointee. /// diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp index d30c975a8ffb6..c184d4a4b1d97 100644 --- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp @@ -744,8 +744,8 @@ static bool shouldUseExternalRttiDescriptor(CIRGenModule &cgm, QualType ty) { return false; if (const auto *recordTy = dyn_cast(ty)) { - const CXXRecordDecl *rd = - cast(recordTy->getOriginalDecl())->getDefinitionOrSelf(); + const auto *rd = + cast(recordTy->getDecl())->getDefinitionOrSelf(); if (!rd->hasDefinition()) return false; @@ -859,9 +859,7 @@ static bool canUseSingleInheritance(const CXXRecordDecl *rd) { /// IsIncompleteClassType - Returns whether the given record type is incomplete. static bool isIncompleteClassType(const RecordType *recordTy) { - return !recordTy->getOriginalDecl() - ->getDefinitionOrSelf() - ->isCompleteDefinition(); + return !recordTy->getDecl()->getDefinitionOrSelf()->isCompleteDefinition(); } /// Returns whether the given type contains an @@ -939,8 +937,7 @@ const char *vTableClassNameForType(const CIRGenModule &cgm, const Type *ty) { case Type::Atomic: // FIXME: GCC treats block pointers as fundamental types?! 
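+  // (These are the mangled vtable names of the __cxxabiv1 type-info classes;
+  // "_ZTVN10__cxxabiv123__fundamental_type_infoE" demangles to "vtable for
+  // __cxxabiv1::__fundamental_type_info".)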
case Type::BlockPointer: - cgm.errorNYI("VTableClassNameForType: __fundamental_type_info"); - break; + return "_ZTVN10__cxxabiv123__fundamental_type_infoE"; case Type::ConstantArray: case Type::IncompleteArray: case Type::VariableArray: @@ -953,13 +950,11 @@ const char *vTableClassNameForType(const CIRGenModule &cgm, const Type *ty) { break; case Type::Enum: - cgm.errorNYI("VTableClassNameForType: Enum"); - break; + return "_ZTVN10__cxxabiv116__enum_type_infoE"; case Type::Record: { - const CXXRecordDecl *rd = - cast(cast(ty)->getOriginalDecl()) - ->getDefinitionOrSelf(); + const auto *rd = cast(cast(ty)->getDecl()) + ->getDefinitionOrSelf(); if (!rd->hasDefinition() || !rd->getNumBases()) { return classTypeInfo; @@ -1031,8 +1026,8 @@ static cir::GlobalLinkageKind getTypeInfoLinkage(CIRGenModule &cgm, return cir::GlobalLinkageKind::LinkOnceODRLinkage; if (const RecordType *record = dyn_cast(ty)) { - const CXXRecordDecl *rd = - cast(record->getOriginalDecl())->getDefinitionOrSelf(); + const auto *rd = + cast(record->getDecl())->getDefinitionOrSelf(); if (rd->hasAttr()) return cir::GlobalLinkageKind::WeakODRLinkage; @@ -1382,9 +1377,8 @@ mlir::Attribute CIRGenItaniumRTTIBuilder::buildTypeInfo( break; case Type::Record: { - const auto *rd = - cast(cast(ty)->getOriginalDecl()) - ->getDefinitionOrSelf(); + const auto *rd = cast(cast(ty)->getDecl()) + ->getDefinitionOrSelf(); if (!rd->hasDefinition() || !rd->getNumBases()) { // We don't need to emit any fields. break; @@ -1651,8 +1645,7 @@ void CIRGenItaniumCXXABI::emitThrow(CIRGenFunction &cgf, // Lowering pass to skip passing the trivial function. // if (const RecordType *recordTy = clangThrowType->getAs()) { - CXXRecordDecl *rec = - cast(recordTy->getOriginalDecl()->getDefinition()); + auto *rec = cast(recordTy->getDecl()->getDefinition()); assert(!cir::MissingFeatures::isTrivialCtorOrDtor()); if (!rec->hasTrivialDestructor()) { cgm.errorNYI("emitThrow: non-trivial destructor"); @@ -1951,6 +1944,15 @@ static cir::FuncOp getItaniumDynamicCastFn(CIRGenFunction &cgf) { return cgf.cgm.createRuntimeFunction(FTy, "__dynamic_cast"); } +static Address emitDynamicCastToVoid(CIRGenFunction &cgf, mlir::Location loc, + QualType srcRecordTy, Address src) { + bool vtableUsesRelativeLayout = + cgf.cgm.getItaniumVTableContext().isRelativeLayout(); + mlir::Value ptr = cgf.getBuilder().createDynCastToVoid( + loc, src.getPointer(), vtableUsesRelativeLayout); + return Address{ptr, src.getAlignment()}; +} + static cir::DynamicCastInfoAttr emitDynamicCastInfo(CIRGenFunction &cgf, mlir::Location loc, QualType srcRecordTy, @@ -1985,10 +1987,8 @@ mlir::Value CIRGenItaniumCXXABI::emitDynamicCast(CIRGenFunction &cgf, bool isCastToVoid = destRecordTy.isNull(); assert((!isCastToVoid || !isRefCast) && "cannot cast to void reference"); - if (isCastToVoid) { - cgm.errorNYI(loc, "emitDynamicCastToVoid"); - return {}; - } + if (isCastToVoid) + return emitDynamicCastToVoid(cgf, loc, srcRecordTy, src).getPointer(); // If the destination is effectively final, the cast succeeds if and only // if the dynamic type of the pointer is exactly the destination type. 
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 82b10515a412f..74d39b73fff8f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -88,6 +88,8 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, FP80Ty = cir::FP80Type::get(&getMLIRContext()); FP128Ty = cir::FP128Type::get(&getMLIRContext()); + AllocaInt8PtrTy = cir::PointerType::get(UInt8Ty, cirAllocaAddressSpace); + PointerAlignInBytes = astContext .toCharUnitsFromBits( @@ -449,7 +451,7 @@ void CIRGenModule::emitGlobalFunctionDefinition(clang::GlobalDecl gd, curCGF = nullptr; setNonAliasAttributes(gd, funcOp); - assert(!cir::MissingFeatures::opFuncAttributesForDefinition()); + setCIRFunctionAttributesForDefinition(funcDecl, funcOp); auto getPriority = [this](const auto *attr) -> int { Expr *e = attr->getPriority(); @@ -1915,6 +1917,102 @@ void CIRGenModule::setFunctionAttributes(GlobalDecl globalDecl, const Decl *decl = globalDecl.getDecl(); func.setGlobalVisibilityAttr(getGlobalVisibilityAttrFromDecl(decl)); } + + // If we plan on emitting this inline builtin, we can't treat it as a builtin. + const auto *fd = cast(globalDecl.getDecl()); + if (fd->isInlineBuiltinDeclaration()) { + const FunctionDecl *fdBody; + bool hasBody = fd->hasBody(fdBody); + (void)hasBody; + assert(hasBody && "Inline builtin declarations should always have an " + "available body!"); + assert(!cir::MissingFeatures::attributeNoBuiltin()); + } +} + +void CIRGenModule::setCIRFunctionAttributesForDefinition( + const clang::FunctionDecl *decl, cir::FuncOp f) { + assert(!cir::MissingFeatures::opFuncUnwindTablesAttr()); + assert(!cir::MissingFeatures::stackProtector()); + + std::optional existingInlineKind = f.getInlineKind(); + bool isNoInline = + existingInlineKind && *existingInlineKind == cir::InlineKind::NoInline; + bool isAlwaysInline = existingInlineKind && + *existingInlineKind == cir::InlineKind::AlwaysInline; + + if (!decl) { + assert(!cir::MissingFeatures::hlsl()); + + if (!isAlwaysInline && + codeGenOpts.getInlining() == CodeGenOptions::OnlyAlwaysInlining) { + // If inlining is disabled and we don't have a declaration to control + // inlining, mark the function as 'noinline' unless it is explicitly + // marked as 'alwaysinline'. + f.setInlineKindAttr( + cir::InlineAttr::get(&getMLIRContext(), cir::InlineKind::NoInline)); + } + + return; + } + + assert(!cir::MissingFeatures::opFuncArmStreamingAttr()); + assert(!cir::MissingFeatures::opFuncArmNewAttr()); + assert(!cir::MissingFeatures::opFuncOptNoneAttr()); + assert(!cir::MissingFeatures::opFuncMinSizeAttr()); + assert(!cir::MissingFeatures::opFuncNakedAttr()); + assert(!cir::MissingFeatures::opFuncNoDuplicateAttr()); + assert(!cir::MissingFeatures::hlsl()); + + // Handle inline attributes + if (decl->hasAttr() && !isAlwaysInline) { + // Add noinline if the function isn't always_inline. + f.setInlineKindAttr( + cir::InlineAttr::get(&getMLIRContext(), cir::InlineKind::NoInline)); + } else if (decl->hasAttr() && !isNoInline) { + // Don't override AlwaysInline with NoInline, or vice versa, since we can't + // specify both in IR. + f.setInlineKindAttr( + cir::InlineAttr::get(&getMLIRContext(), cir::InlineKind::AlwaysInline)); + } else if (codeGenOpts.getInlining() == CodeGenOptions::OnlyAlwaysInlining) { + // If inlining is disabled, force everything that isn't always_inline + // to carry an explicit noinline attribute. 
+ if (!isAlwaysInline) { + f.setInlineKindAttr( + cir::InlineAttr::get(&getMLIRContext(), cir::InlineKind::NoInline)); + } + } else { + // Otherwise, propagate the inline hint attribute and potentially use its + // absence to mark things as noinline. + // Search function and template pattern redeclarations for inline. + if (auto *fd = dyn_cast(decl)) { + // TODO: Share this checkForInline implementation with classic codegen. + // This logic is likely to change over time, so sharing would help ensure + // consistency. + auto checkForInline = [](const FunctionDecl *decl) { + auto checkRedeclForInline = [](const FunctionDecl *redecl) { + return redecl->isInlineSpecified(); + }; + if (any_of(decl->redecls(), checkRedeclForInline)) + return true; + const FunctionDecl *pattern = decl->getTemplateInstantiationPattern(); + if (!pattern) + return false; + return any_of(pattern->redecls(), checkRedeclForInline); + }; + if (checkForInline(fd)) { + f.setInlineKindAttr(cir::InlineAttr::get(&getMLIRContext(), + cir::InlineKind::InlineHint)); + } else if (codeGenOpts.getInlining() == + CodeGenOptions::OnlyHintInlining && + !fd->isInlined() && !isAlwaysInline) { + f.setInlineKindAttr( + cir::InlineAttr::get(&getMLIRContext(), cir::InlineKind::NoInline)); + } + } + } + + assert(!cir::MissingFeatures::opFuncColdHotAttr()); } cir::FuncOp CIRGenModule::getOrCreateCIRFunction( diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 690f0ed0e9bde..1fc116d98a858 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -429,6 +429,10 @@ class CIRGenModule : public CIRGenTypeCache { void setFunctionAttributes(GlobalDecl gd, cir::FuncOp f, bool isIncompleteFunction, bool isThunk); + /// Set extra attributes (inline, etc.) for a function. + void setCIRFunctionAttributesForDefinition(const clang::FunctionDecl *fd, + cir::FuncOp f); + void emitGlobalDefinition(clang::GlobalDecl gd, mlir::Operation *op = nullptr); void emitGlobalFunctionDefinition(clang::GlobalDecl gd, mlir::Operation *op); diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index 3d86f71b077e2..ce4ae7ec5efc4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -1005,7 +1005,7 @@ class OpenACCClauseCIREmitter final /*temporary=*/nullptr, OpenACCReductionOperator::Invalid, Decl::castToDeclContext(cgf.curFuncDecl), opInfo.origType, opInfo.bounds.size(), opInfo.boundTypes, opInfo.baseType, - privateOp); + privateOp, /*reductionCombinerRecipes=*/{}); // TODO: OpenACC: The dialect is going to change in the near future to // have these be on a different operation, so when that changes, we // probably need to change these here. 
@@ -1046,7 +1046,7 @@ class OpenACCClauseCIREmitter final
                 OpenACCReductionOperator::Invalid,
                 Decl::castToDeclContext(cgf.curFuncDecl), opInfo.origType,
                 opInfo.bounds.size(), opInfo.boundTypes, opInfo.baseType,
-                firstPrivateOp);
+                firstPrivateOp, /*reductionCombinerRecipes=*/{});

         // TODO: OpenACC: The dialect is going to change in the near future to
         // have these be on a different operation, so when that changes, we
@@ -1088,7 +1088,7 @@ class OpenACCClauseCIREmitter final
                 /*temporary=*/nullptr, clause.getReductionOp(),
                 Decl::castToDeclContext(cgf.curFuncDecl), opInfo.origType,
                 opInfo.bounds.size(), opInfo.boundTypes, opInfo.baseType,
-                reductionOp);
+                reductionOp, varRecipe.CombinerRecipes);

             operation.addReduction(builder.getContext(), reductionOp, recipe);
           }
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
index 24a5fc27c4775..be063033ddcfc 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
@@ -398,6 +398,7 @@ void OpenACCRecipeBuilderBase::createRecipeDestroySection(
     emitDestroy(block->getArgument(1), elementTy);
   }

+  ls.forceCleanup();
   mlir::acc::YieldOp::create(builder, locEnd);
 }
 void OpenACCRecipeBuilderBase::makeBoundsInit(
@@ -480,6 +481,7 @@ void OpenACCRecipeBuilderBase::createInitRecipe(
                    /*isInitSection=*/true);
   }

+  ls.forceCleanup();
   mlir::acc::YieldOp::create(builder, locEnd);
 }

@@ -518,6 +520,7 @@ void OpenACCRecipeBuilderBase::createFirstprivateRecipeCopy(
   cgf.emitAutoVarInit(tempDeclEmission);

   builder.setInsertionPointToEnd(&copyRegion.back());
+  ls.forceCleanup();
   mlir::acc::YieldOp::create(builder, locEnd);
 }

@@ -527,16 +530,146 @@ void OpenACCRecipeBuilderBase::createFirstprivateRecipeCopy(
 // doesn't restore it afterwards.
 void OpenACCRecipeBuilderBase::createReductionRecipeCombiner(
     mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp,
-    mlir::acc::ReductionRecipeOp recipe, size_t numBounds) {
+    mlir::acc::ReductionRecipeOp recipe, size_t numBounds, QualType origType,
+    llvm::ArrayRef<OpenACCReductionRecipe::CombinerRecipe> combinerRecipes) {
   mlir::Block *block =
       createRecipeBlock(recipe.getCombinerRegion(), mainOp.getType(), loc,
                         numBounds, /*isInit=*/false);
   builder.setInsertionPointToEnd(&recipe.getCombinerRegion().back());
   CIRGenFunction::LexicalScope ls(cgf, loc, block);

-  mlir::BlockArgument lhsArg = block->getArgument(0);
+  mlir::Value lhsArg = block->getArgument(0);
+  mlir::Value rhsArg = block->getArgument(1);
+  llvm::MutableArrayRef<mlir::BlockArgument> boundsRange =
+      block->getArguments().drop_front(2);
+
+  if (llvm::any_of(combinerRecipes, [](auto &r) { return r.Op == nullptr; })) {
+    cgf.cgm.errorNYI(loc, "OpenACC Reduction combiner not generated");
+    mlir::acc::YieldOp::create(builder, locEnd, block->getArgument(0));
+    return;
+  }
+
+  // Apply the bounds so that we can get our bounds emitted correctly.
+  for (mlir::BlockArgument boundArg : llvm::reverse(boundsRange))
+    std::tie(lhsArg, rhsArg) =
+        createBoundsLoop(lhsArg, rhsArg, boundArg, loc, /*inverse=*/false);
+
+  // Emitter for when we know this isn't a struct or array we have to loop
+  // through. This should work for the 'field' once the get-element call has
+  // been made.
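+  // Binds the recipe's LHS/RHS VarDecls to the incoming element pointers,
+  // then emits the recorded combiner statement in the current scope.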
+  auto emitSingleCombiner =
+      [&](mlir::Value lhsArg, mlir::Value rhsArg,
+          const OpenACCReductionRecipe::CombinerRecipe &combiner) {
+        mlir::Type elementTy =
+            mlir::cast<cir::PointerType>(lhsArg.getType()).getPointee();
+        CIRGenFunction::DeclMapRevertingRAII declMapRAIILhs{cgf, combiner.LHS};
+        cgf.setAddrOfLocalVar(
+            combiner.LHS, Address{lhsArg, elementTy,
+                                  cgf.getContext().getDeclAlign(combiner.LHS)});
+        CIRGenFunction::DeclMapRevertingRAII declMapRAIIRhs{cgf, combiner.RHS};
+        cgf.setAddrOfLocalVar(
+            combiner.RHS, Address{rhsArg, elementTy,
+                                  cgf.getContext().getDeclAlign(combiner.RHS)});
+
+        [[maybe_unused]] mlir::LogicalResult stmtRes =
+            cgf.emitStmt(combiner.Op, /*useCurrentScope=*/true);
+      };
+
+  // Emitter for when we know this is either a non-array or an element of an
+  // array (which also shouldn't be an array type?). This function should
+  // generate the combiner code for an entire 'array-element'/non-array,
+  // including diving into each element of a struct (if necessary).
+  auto emitCombiner = [&](mlir::Value lhsArg, mlir::Value rhsArg, QualType ty) {
+    assert(!ty->isArrayType() && "Array type shouldn't get here");
+    if (const auto *rd = ty->getAsRecordDecl()) {
+      if (combinerRecipes.size() == 1 &&
+          cgf.getContext().hasSameType(ty, combinerRecipes[0].LHS->getType())) {
+        // If this is a 'top level' operator on the type we can just emit this
+        // as a simple one.
+        emitSingleCombiner(lhsArg, rhsArg, combinerRecipes[0]);
+      } else {
+        // Else we have to handle each individual field after a get-element.
+        const CIRGenRecordLayout &layout =
+            cgf.cgm.getTypes().getCIRGenRecordLayout(rd);
+        for (const auto &[field, combiner] :
+             llvm::zip_equal(rd->fields(), combinerRecipes)) {
+          mlir::Type fieldType = cgf.convertType(field->getType());
+          auto fieldPtr = cir::PointerType::get(fieldType);
+          unsigned fieldIndex = layout.getCIRFieldNo(field);
+
+          mlir::Value lhsField = builder.createGetMember(
+              loc, fieldPtr, lhsArg, field->getName(), fieldIndex);
+          mlir::Value rhsField = builder.createGetMember(
+              loc, fieldPtr, rhsArg, field->getName(), fieldIndex);
+
+          emitSingleCombiner(lhsField, rhsField, combiner);
+        }
+      }
+
+    } else {
+      // If this is a single thing (because we should know this isn't an
+      // array, as Sema wouldn't let us get here), we can just do a normal
+      // emit call.
+      emitSingleCombiner(lhsArg, rhsArg, combinerRecipes[0]);
+    }
+  };
+
+  if (const auto *cat = cgf.getContext().getAsConstantArrayType(origType)) {
+    // If we're in an array, we have to emit the combiner for each element of
+    // the array.
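+    // Build an index alloca and a cir.for loop over [0, array size),
+    // applying the element-wise combiner on each iteration.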
+    auto itrTy = mlir::cast<cir::IntType>(cgf.PtrDiffTy);
+    auto itrPtrTy = cir::PointerType::get(itrTy);
+
+    mlir::Value zero =
+        builder.getConstInt(loc, mlir::cast<cir::IntType>(cgf.PtrDiffTy), 0);
+    mlir::Value itr =
+        cir::AllocaOp::create(builder, loc, itrPtrTy, itrTy, "itr",
+                              cgf.cgm.getSize(cgf.getPointerAlign()));
+    builder.CIRBaseBuilderTy::createStore(loc, zero, itr);
+
+    builder.setInsertionPointAfter(builder.createFor(
+        loc,
+        /*condBuilder=*/
+        [&](mlir::OpBuilder &b, mlir::Location loc) {
+          auto loadItr = cir::LoadOp::create(builder, loc, {itr});
+          mlir::Value arraySize = builder.getConstInt(
+              loc, mlir::cast<cir::IntType>(cgf.PtrDiffTy), cat->getZExtSize());
+          auto cmp = builder.createCompare(loc, cir::CmpOpKind::lt, loadItr,
+                                           arraySize);
+          builder.createCondition(cmp);
+        },
+        /*bodyBuilder=*/
+        [&](mlir::OpBuilder &b, mlir::Location loc) {
+          auto loadItr = cir::LoadOp::create(builder, loc, {itr});
+          auto lhsElt = builder.getArrayElement(
+              loc, loc, lhsArg, cgf.convertType(cat->getElementType()), loadItr,
+              /*shouldDecay=*/true);
+          auto rhsElt = builder.getArrayElement(
+              loc, loc, rhsArg, cgf.convertType(cat->getElementType()), loadItr,
+              /*shouldDecay=*/true);
+
+          emitCombiner(lhsElt, rhsElt, cat->getElementType());
+          builder.createYield(loc);
+        },
+        /*stepBuilder=*/
+        [&](mlir::OpBuilder &b, mlir::Location loc) {
+          auto loadItr = cir::LoadOp::create(builder, loc, {itr});
+          auto inc = cir::UnaryOp::create(builder, loc, loadItr.getType(),
+                                          cir::UnaryOpKind::Inc, loadItr);
+          builder.CIRBaseBuilderTy::createStore(loc, inc, itr);
+          builder.createYield(loc);
+        }));
-  mlir::acc::YieldOp::create(builder, locEnd, lhsArg);
+  } else if (origType->isArrayType()) {
+    cgf.cgm.errorNYI(loc,
+                     "OpenACC Reduction combiner non-constant array recipe");
+  } else {
+    emitCombiner(lhsArg, rhsArg, origType);
+  }
+
+  builder.setInsertionPointToEnd(&recipe.getCombinerRegion().back());
+  ls.forceCleanup();
+  mlir::acc::YieldOp::create(builder, locEnd, block->getArgument(0));
 }
 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h
index a5da7444ebf96..745d42446896e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h
@@ -64,10 +64,10 @@ class OpenACCRecipeBuilderBase {
   // that this function is not 'insertion point' clean, in that it alters the
   // insertion point to be inside of the 'combiner' section of the recipe, but
   // doesn't restore it afterwards.
- void createReductionRecipeCombiner(mlir::Location loc, mlir::Location locEnd, - mlir::Value mainOp, - mlir::acc::ReductionRecipeOp recipe, - size_t numBounds); + void createReductionRecipeCombiner( + mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp, + mlir::acc::ReductionRecipeOp recipe, size_t numBounds, QualType origType, + llvm::ArrayRef combinerRecipes); void createInitRecipe(mlir::Location loc, mlir::Location locEnd, SourceRange exprRange, mlir::Value mainOp, @@ -169,7 +169,9 @@ class OpenACCRecipeBuilder : OpenACCRecipeBuilderBase { const Expr *varRef, const VarDecl *varRecipe, const VarDecl *temporary, OpenACCReductionOperator reductionOp, DeclContext *dc, QualType origType, size_t numBounds, llvm::ArrayRef boundTypes, QualType baseType, - mlir::Value mainOp) { + mlir::Value mainOp, + llvm::ArrayRef + reductionCombinerRecipes) { assert(!varRecipe->getType()->isSpecificBuiltinType( BuiltinType::ArraySection) && "array section shouldn't make it to recipe creation"); @@ -208,7 +210,8 @@ class OpenACCRecipeBuilder : OpenACCRecipeBuilderBase { createInitRecipe(loc, locEnd, varRef->getSourceRange(), mainOp, recipe.getInitRegion(), numBounds, boundTypes, varRecipe, origType, /*emitInitExpr=*/true); - createReductionRecipeCombiner(loc, locEnd, mainOp, recipe, numBounds); + createReductionRecipeCombiner(loc, locEnd, mainOp, recipe, numBounds, + origType, reductionCombinerRecipes); } else { static_assert(std::is_same_v); createInitRecipe(loc, locEnd, varRef->getSourceRange(), mainOp, diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index cfd48a227ed20..ad8c4d004654f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -475,8 +475,8 @@ mlir::LogicalResult CIRGenFunction::emitReturnStmt(const ReturnStmt &s) { } break; case cir::TEK_Complex: - getCIRGenModule().errorNYI(s.getSourceRange(), - "complex function return type"); + emitComplexExprIntoLValue(rv, makeAddrLValue(returnValue, rv->getType()), + /*isInit=*/true); break; case cir::TEK_Aggregate: assert(!cir::MissingFeatures::aggValueSlotGC()); @@ -536,7 +536,7 @@ mlir::LogicalResult CIRGenFunction::emitLabel(const clang::LabelDecl &d) { mlir::Block *currBlock = builder.getBlock(); mlir::Block *labelBlock = currBlock; - if (!currBlock->empty()) { + if (!currBlock->empty() || currBlock->isEntryBlock()) { { mlir::OpBuilder::InsertionGuard guard(builder); labelBlock = builder.createBlock(builder.getBlock()->getParent()); diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h index 273ec7f06b4b5..b5612d9127506 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h +++ b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h @@ -65,6 +65,9 @@ struct CIRGenTypeCache { cir::PointerType VoidPtrTy; cir::PointerType UInt8PtrTy; + /// void* in alloca address space + cir::PointerType AllocaInt8PtrTy; + /// The size and alignment of a pointer into the generic address space. 
union { unsigned char PointerAlignInBytes; diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp index 2ab1ea0c8ff8e..d1b91d0c73c04 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp @@ -159,7 +159,7 @@ isSafeToConvert(const RecordDecl *rd, CIRGenTypes &cgt, for (const clang::CXXBaseSpecifier &i : crd->bases()) if (!isSafeToConvert(i.getType() ->castAs() - ->getOriginalDecl() + ->getDecl() ->getDefinitionOrSelf(), cgt, alreadyChecked)) return false; @@ -279,8 +279,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) { // Process record types before the type cache lookup. if (const auto *recordType = dyn_cast(type)) - return convertRecordDeclType( - recordType->getOriginalDecl()->getDefinitionOrSelf()); + return convertRecordDeclType(recordType->getDecl()->getDefinitionOrSelf()); // Has the type already been processed? TypeCacheTy::iterator tci = typeCache.find(ty); @@ -421,6 +420,16 @@ mlir::Type CIRGenTypes::convertType(QualType type) { break; } + case Type::VariableArray: { + const VariableArrayType *a = cast(ty); + if (a->getIndexTypeCVRQualifiers() != 0) + cgm.errorNYI(SourceLocation(), "non trivial array types", type); + // VLAs resolve to the innermost element type; this matches + // the return of alloca, and there isn't any obviously better choice. + resultType = convertTypeForMem(a->getElementType()); + break; + } + case Type::IncompleteArray: { const IncompleteArrayType *arrTy = cast(ty); if (arrTy->getIndexTypeCVRQualifiers() != 0) diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h index 25b6ecb503a6e..ab245a771d72c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenValue.h +++ b/clang/lib/CIR/CodeGen/CIRGenValue.h @@ -307,8 +307,8 @@ class AggValueSlot { /// This is set to true if some external code is responsible for setting up a /// destructor for the slot. Otherwise the code which constructs it should /// push the appropriate cleanup. - LLVM_PREFERRED_TYPE(bool) - LLVM_ATTRIBUTE_UNUSED unsigned destructedFlag : 1; + [[maybe_unused]] + LLVM_PREFERRED_TYPE(bool) unsigned destructedFlag : 1; /// This is set to true if the memory in the slot is known to be zero before /// the assignment into it. This means that zero fields don't need to be set. @@ -326,16 +326,16 @@ class AggValueSlot { /// over. Since it's invalid in general to memcpy a non-POD C++ /// object, it's important that this flag never be set when /// evaluating an expression which constructs such an object. - LLVM_PREFERRED_TYPE(bool) - LLVM_ATTRIBUTE_UNUSED unsigned aliasedFlag : 1; + [[maybe_unused]] + LLVM_PREFERRED_TYPE(bool) unsigned aliasedFlag : 1; /// This is set to true if the tail padding of this slot might overlap /// another object that may have already been initialized (and whose /// value must be preserved by this initialization). If so, we may only /// store up to the dsize of the type. Otherwise we can widen stores to /// the size of the type. 
- LLVM_PREFERRED_TYPE(bool) - LLVM_ATTRIBUTE_UNUSED unsigned overlapFlag : 1; + [[maybe_unused]] + LLVM_PREFERRED_TYPE(bool) unsigned overlapFlag : 1; public: enum IsDestructed_t { IsNotDestructed, IsDestructed }; @@ -380,6 +380,15 @@ class AggValueSlot { clang::Qualifiers getQualifiers() const { return quals; } + bool isVolatile() const { return quals.hasVolatile(); } + + void setVolatile(bool flag) { + if (flag) + quals.addVolatile(); + else + quals.removeVolatile(); + } + Address getAddress() const { return addr; } bool isIgnored() const { return !addr.isValid(); } @@ -390,6 +399,8 @@ class AggValueSlot { IsZeroed_t isZeroed() const { return IsZeroed_t(zeroedFlag); } + IsAliased_t isPotentiallyAliased() const { return IsAliased_t(aliasedFlag); } + RValue asRValue() const { if (isIgnored()) return RValue::getIgnored(); diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h index 67a72f5384c32..1d6e671b21b79 100644 --- a/clang/lib/CIR/CodeGen/EHScopeStack.h +++ b/clang/lib/CIR/CodeGen/EHScopeStack.h @@ -155,6 +155,14 @@ class EHScopeStack { /// Pops a cleanup scope off the stack. This is private to CIRGenCleanup.cpp. void popCleanup(); + /// Push a set of catch handlers on the stack. The catch is + /// uninitialized and will need to have the given number of handlers + /// set on it. + class EHCatchScope *pushCatch(unsigned numHandlers); + + /// Pops a catch scope off the stack. This is private to CIRGenException.cpp. + void popCatch(); + /// Determines whether the exception-scopes stack is empty. bool empty() const { return startOfData == endOfBuffer; } diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 12837d953d677..fa180f54c4474 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -1758,6 +1758,36 @@ ParseResult cir::FuncOp::parse(OpAsmParser &parser, OperationState &state) { }).failed()) return failure(); + // Parse optional inline kind: inline(never|always|hint) + if (parser.parseOptionalKeyword("inline").succeeded()) { + if (parser.parseLParen().failed()) + return failure(); + + llvm::StringRef inlineKindStr; + const std::array + allowedInlineKindStrs{ + cir::stringifyInlineKind(cir::InlineKind::NoInline), + cir::stringifyInlineKind(cir::InlineKind::AlwaysInline), + cir::stringifyInlineKind(cir::InlineKind::InlineHint), + }; + if (parser.parseOptionalKeyword(&inlineKindStr, allowedInlineKindStrs) + .failed()) + return parser.emitError(parser.getCurrentLocation(), + "expected 'never', 'always', or 'hint'"); + + std::optional inlineKind = + cir::symbolizeInlineKind(inlineKindStr); + if (!inlineKind) + return parser.emitError(parser.getCurrentLocation(), + "invalid inline kind"); + + state.addAttribute(getInlineKindAttrName(state.name), + cir::InlineAttr::get(builder.getContext(), *inlineKind)); + + if (parser.parseRParen().failed()) + return failure(); + } + // Parse the optional function body. auto *body = state.addRegion(); OptionalParseResult parseResult = parser.parseOptionalRegion( @@ -1851,6 +1881,10 @@ void cir::FuncOp::print(OpAsmPrinter &p) { p << "(" << globalDtorPriority.value() << ")"; } + if (cir::InlineAttr inlineAttr = getInlineKindAttr()) { + p << " inline(" << cir::stringifyInlineKind(inlineAttr.getValue()) << ")"; + } + // Print the body if this is not an external function. 
 Region &body = getOperation()->getRegion(0);
 if (!body.empty()) {
@@ -1944,13 +1978,19 @@ void cir::TernaryOp::build(
   result.addOperands(cond);
   OpBuilder::InsertionGuard guard(builder);
   Region *trueRegion = result.addRegion();
-  Block *block = builder.createBlock(trueRegion);
+  builder.createBlock(trueRegion);
   trueBuilder(builder, result.location);
   Region *falseRegion = result.addRegion();
   builder.createBlock(falseRegion);
   falseBuilder(builder, result.location);

-  auto yield = dyn_cast(block->getTerminator());
+  // Get the result type from whichever branch has a yield (the other may
+  // end in an unreachable from a throw expression).
+  auto yield =
+      dyn_cast_or_null<cir::YieldOp>(trueRegion->back().getTerminator());
+  if (!yield)
+    yield = dyn_cast_or_null<cir::YieldOp>(falseRegion->back().getTerminator());
+
   assert((yield && yield.getNumOperands() <= 1) &&
          "expected zero or one result type");
   if (yield.getNumOperands() == 1)
@@ -2901,16 +2941,17 @@ mlir::LogicalResult cir::ThrowOp::verify() {
 }

//===----------------------------------------------------------------------===//
-// AtomicCmpXchg
+// AtomicFetchOp
//===----------------------------------------------------------------------===//

-LogicalResult cir::AtomicCmpXchg::verify() {
-  mlir::Type pointeeType = getPtr().getType().getPointee();
-
-  if (pointeeType != getExpected().getType() ||
-      pointeeType != getDesired().getType())
-    return emitOpError("ptr, expected and desired types must match");
-
+LogicalResult cir::AtomicFetchOp::verify() {
+  if (getBinop() != cir::AtomicFetchKind::Add &&
+      getBinop() != cir::AtomicFetchKind::Sub &&
+      getBinop() != cir::AtomicFetchKind::Max &&
+      getBinop() != cir::AtomicFetchKind::Min &&
+      !mlir::isa<cir::IntType>(getVal().getType()))
+    return emitError("only atomic add, sub, max, and min operation could "
+                     "operate on floating-point values");
   return success();
 }
@@ -2928,6 +2969,133 @@ LogicalResult cir::TypeInfoAttr::verify(
   return success();
 }

+//===----------------------------------------------------------------------===//
+// TryOp
+//===----------------------------------------------------------------------===//
+
+void cir::TryOp::getSuccessorRegions(
+    mlir::RegionBranchPoint point,
+    llvm::SmallVectorImpl<mlir::RegionSuccessor> &regions) {
+  // The `try` and the `catchers` region branch back to the parent operation.
+  if (!point.isParent()) {
+    regions.push_back(mlir::RegionSuccessor());
+    return;
+  }
+
+  regions.push_back(mlir::RegionSuccessor(&getTryRegion()));
+
+  // TODO(CIR): If we know a target function never throws a specific type, we
+  // can remove the catch handler.
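+  // For now, conservatively treat every handler region as a possible
+  // successor.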
+  for (mlir::Region &handlerRegion : this->getHandlerRegions())
+    regions.push_back(mlir::RegionSuccessor(&handlerRegion));
+}
+
+static void
+printTryHandlerRegions(mlir::OpAsmPrinter &printer, cir::TryOp op,
+                       mlir::MutableArrayRef<mlir::Region> handlerRegions,
+                       mlir::ArrayAttr handlerTypes) {
+  if (!handlerTypes)
+    return;
+
+  for (const auto [typeIdx, typeAttr] : llvm::enumerate(handlerTypes)) {
+    if (typeIdx)
+      printer << " ";
+
+    if (mlir::isa<cir::CatchAllAttr>(typeAttr)) {
+      printer << "catch all ";
+    } else if (mlir::isa<cir::UnwindAttr>(typeAttr)) {
+      printer << "unwind ";
+    } else {
+      printer << "catch [type ";
+      printer.printAttribute(typeAttr);
+      printer << "] ";
+    }
+
+    printer.printRegion(handlerRegions[typeIdx],
+                        /*printEntryBlockArgs=*/false,
+                        /*printBlockTerminators=*/true);
+  }
+}
+
+static mlir::ParseResult parseTryHandlerRegions(
+    mlir::OpAsmParser &parser,
+    llvm::SmallVectorImpl<std::unique_ptr<mlir::Region>> &handlerRegions,
+    mlir::ArrayAttr &handlerTypes) {
+
+  auto parseCheckedCatcherRegion = [&]() -> mlir::ParseResult {
+    handlerRegions.emplace_back(new mlir::Region);
+
+    mlir::Region &currRegion = *handlerRegions.back();
+    mlir::SMLoc regionLoc = parser.getCurrentLocation();
+    if (parser.parseRegion(currRegion)) {
+      handlerRegions.clear();
+      return failure();
+    }
+
+    if (currRegion.empty())
+      return parser.emitError(regionLoc, "handler region shall not be empty");
+
+    if (!(currRegion.back().mightHaveTerminator() &&
+          currRegion.back().getTerminator()))
+      return parser.emitError(
+          regionLoc, "blocks are expected to be explicitly terminated");
+
+    return success();
+  };
+
+  bool hasCatchAll = false;
+  llvm::SmallVector<mlir::Attribute> catcherAttrs;
+  while (parser.parseOptionalKeyword("catch").succeeded()) {
+    bool hasLSquare = parser.parseOptionalLSquare().succeeded();
+
+    llvm::StringRef attrStr;
+    if (parser.parseOptionalKeyword(&attrStr, {"all", "type"}).failed())
+      return parser.emitError(parser.getCurrentLocation(),
+                              "expected 'all' or 'type' keyword");
+
+    bool isCatchAll = attrStr == "all";
+    if (isCatchAll) {
+      if (hasCatchAll)
+        return parser.emitError(parser.getCurrentLocation(),
+                                "can't have more than one catch all");
+      hasCatchAll = true;
+    }
+
+    mlir::Attribute exceptionRTTIAttr;
+    if (!isCatchAll && parser.parseAttribute(exceptionRTTIAttr).failed())
+      return parser.emitError(parser.getCurrentLocation(),
+                              "expected valid RTTI info attribute");
+
+    catcherAttrs.push_back(isCatchAll
+                               ?
cir::CatchAllAttr::get(parser.getContext())
+                               : exceptionRTTIAttr);
+
+    if (hasLSquare && isCatchAll)
+      return parser.emitError(parser.getCurrentLocation(),
+                              "catch all doesn't need RTTI info attribute");
+
+    if (hasLSquare && parser.parseRSquare().failed())
+      return parser.emitError(parser.getCurrentLocation(),
+                              "expected `]` after RTTI info attribute");
+
+    if (parseCheckedCatcherRegion().failed())
+      return mlir::failure();
+  }
+
+  if (parser.parseOptionalKeyword("unwind").succeeded()) {
+    if (hasCatchAll)
+      return parser.emitError(parser.getCurrentLocation(),
+                              "unwind can't be used with catch all");
+
+    catcherAttrs.push_back(cir::UnwindAttr::get(parser.getContext()));
+    if (parseCheckedCatcherRegion().failed())
+      return mlir::failure();
+  }
+
+  handlerTypes = parser.getBuilder().getArrayAttr(catcherAttrs);
+  return mlir::success();
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd op method definitions
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp
index 26e5c0572f12e..46bd186ccada1 100644
--- a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp
@@ -505,10 +505,19 @@ class CIRTernaryOpFlattening : public mlir::OpRewritePattern<cir::TernaryOp> {
     Block *trueBlock = &trueRegion.front();
     mlir::Operation *trueTerminator = trueRegion.back().getTerminator();
     rewriter.setInsertionPointToEnd(&trueRegion.back());
-    auto trueYieldOp = dyn_cast<cir::YieldOp>(trueTerminator);
-    rewriter.replaceOpWithNewOp<cir::BrOp>(trueYieldOp, trueYieldOp.getArgs(),
-                                           continueBlock);
+    // Handle both yield and unreachable terminators (throw expressions).
+    if (auto trueYieldOp = dyn_cast<cir::YieldOp>(trueTerminator)) {
+      rewriter.replaceOpWithNewOp<cir::BrOp>(trueYieldOp, trueYieldOp.getArgs(),
+                                             continueBlock);
+    } else if (isa<cir::UnreachableOp>(trueTerminator)) {
+      // Terminator is unreachable (e.g., from throw), just keep it.
+    } else {
+      trueTerminator->emitError("unexpected terminator in ternary true region, "
+                                "expected yield or unreachable, got: ")
+          << trueTerminator->getName();
+      return mlir::failure();
+    }
     rewriter.inlineRegionBefore(trueRegion, continueBlock);

     Block *falseBlock = continueBlock;
@@ -517,9 +526,19 @@ class CIRTernaryOpFlattening : public mlir::OpRewritePattern<cir::TernaryOp> {
       falseBlock = &falseRegion.front();
       mlir::Operation *falseTerminator = falseRegion.back().getTerminator();
       rewriter.setInsertionPointToEnd(&falseRegion.back());
-      auto falseYieldOp = dyn_cast<cir::YieldOp>(falseTerminator);
-      rewriter.replaceOpWithNewOp<cir::BrOp>(falseYieldOp, falseYieldOp.getArgs(),
-                                             continueBlock);
+
+      // Handle both yield and unreachable terminators (throw expressions).
+      if (auto falseYieldOp = dyn_cast<cir::YieldOp>(falseTerminator)) {
+        rewriter.replaceOpWithNewOp<cir::BrOp>(
+            falseYieldOp, falseYieldOp.getArgs(), continueBlock);
+      } else if (isa<cir::UnreachableOp>(falseTerminator)) {
+        // Terminator is unreachable (e.g., from throw), just keep it.
+      } else {
+        falseTerminator->emitError("unexpected terminator in ternary false "
+                                   "region, expected yield or unreachable, got: ")
+            << falseTerminator->getName();
+        return mlir::failure();
+      }
       rewriter.inlineRegionBefore(falseRegion, continueBlock);

       rewriter.setInsertionPointToEnd(condBlock);
@@ -532,10 +551,100 @@
   }
 };

+class CIRTryOpFlattening : public mlir::OpRewritePattern<cir::TryOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::Block *buildTryBody(cir::TryOp tryOp,
+                            mlir::PatternRewriter
&rewriter) const {
+    // Split the current block before the TryOp to create the inlining
+    // point.
+    mlir::Block *beforeTryScopeBlock = rewriter.getInsertionBlock();
+    mlir::Block *afterTry =
+        rewriter.splitBlock(beforeTryScopeBlock, rewriter.getInsertionPoint());
+
+    // Inline body region.
+    mlir::Block *beforeBody = &tryOp.getTryRegion().front();
+    rewriter.inlineRegionBefore(tryOp.getTryRegion(), afterTry);
+
+    // Branch into the body of the region.
+    rewriter.setInsertionPointToEnd(beforeTryScopeBlock);
+    cir::BrOp::create(rewriter, tryOp.getLoc(), mlir::ValueRange(), beforeBody);
+    return afterTry;
+  }
+
+  void buildHandlers(cir::TryOp tryOp, mlir::PatternRewriter &rewriter,
+                     mlir::Block *afterBody, mlir::Block *afterTry,
+                     SmallVectorImpl<CallOp> &callsToRewrite,
+                     SmallVectorImpl<mlir::Block *> &landingPads) const {
+    // Replace the tryOp return with a branch that jumps out of the body.
+    rewriter.setInsertionPointToEnd(afterBody);
+
+    mlir::Block *beforeCatch = rewriter.getInsertionBlock();
+    rewriter.setInsertionPointToEnd(beforeCatch);
+
+    // Check if the terminator is a YieldOp because there could be another
+    // terminator, e.g. unreachable.
+    if (auto tryBodyYield = dyn_cast<cir::YieldOp>(afterBody->getTerminator()))
+      rewriter.replaceOpWithNewOp<cir::BrOp>(tryBodyYield, afterTry);
+
+    mlir::ArrayAttr handlers = tryOp.getHandlerTypesAttr();
+    if (!handlers || handlers.empty())
+      return;
+
+    llvm_unreachable("TryOpFlattening buildHandlers with CallsOp is NYI");
+  }
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::TryOp tryOp,
+                  mlir::PatternRewriter &rewriter) const override {
+    mlir::OpBuilder::InsertionGuard guard(rewriter);
+    mlir::Block *afterBody = &tryOp.getTryRegion().back();
+
+    // Grab the collection of `cir.call exception`s to rewrite to
+    // `cir.try_call`.
+    llvm::SmallVector<CallOp> callsToRewrite;
+    tryOp.getTryRegion().walk([&](CallOp op) {
+      // Only grab calls within immediate closest TryOp scope.
+      if (op->getParentOfType<cir::TryOp>() != tryOp)
+        return;
+      assert(!cir::MissingFeatures::opCallExceptionAttr());
+      callsToRewrite.push_back(op);
+    });
+
+    if (!callsToRewrite.empty())
+      llvm_unreachable(
+          "TryOpFlattening with try block that contains CallOps is NYI");
+
+    // Build try body.
+    mlir::Block *afterTry = buildTryBody(tryOp, rewriter);
+
+    // Build handlers.
+    llvm::SmallVector<mlir::Block *> landingPads;
+    buildHandlers(tryOp, rewriter, afterBody, afterTry, callsToRewrite,
+                  landingPads);
+
+    rewriter.eraseOp(tryOp);
+
+    assert((landingPads.size() == callsToRewrite.size()) &&
+           "expected matching number of entries");
+
+    // Quick block cleanup: no indirection to the post try block.
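// (Reviewer sketch, not part of the patch: the cleanup below collapses a
// one-branch indirection by splicing the branch destination into the block
// that ends the flattened try; block names are hypothetical.)
//
//   ^afterTry:                      ^afterTry:
//     cir.br ^postTry         =>      <ops of ^postTry>
//   ^postTry:
//     <ops>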
+ auto brOp = dyn_cast(afterTry->getTerminator()); + if (brOp && brOp.getDest()->hasNoPredecessors()) { + mlir::Block *srcBlock = brOp.getDest(); + rewriter.eraseOp(brOp); + rewriter.mergeBlocks(srcBlock, afterTry); + } + + return mlir::success(); + } +}; + void populateFlattenCFGPatterns(RewritePatternSet &patterns) { patterns .add( + CIRSwitchOpFlattening, CIRTernaryOpFlattening, CIRTryOpFlattening>( patterns.getContext()); } @@ -549,7 +658,7 @@ void CIRFlattenCFGPass::runOnOperation() { assert(!cir::MissingFeatures::ifOp()); assert(!cir::MissingFeatures::switchOp()); assert(!cir::MissingFeatures::tryOp()); - if (isa(op)) + if (isa(op)) ops.push_back(op); }); diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp index 7d3c711251b9f..11ce2a81e7f39 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp @@ -92,7 +92,53 @@ static mlir::Value buildDynamicCastToVoidAfterNullCheck(cir::CIRBaseBuilderTy &builder, clang::ASTContext &astCtx, cir::DynamicCastOp op) { - llvm_unreachable("dynamic cast to void is NYI"); + mlir::Location loc = op.getLoc(); + bool vtableUsesRelativeLayout = op.getRelativeLayout(); + + // TODO(cir): consider address space in this function. + assert(!cir::MissingFeatures::addressSpace()); + + mlir::Type vtableElemTy; + uint64_t vtableElemAlign; + if (vtableUsesRelativeLayout) { + vtableElemTy = builder.getSIntNTy(32); + vtableElemAlign = 4; + } else { + const auto &targetInfo = astCtx.getTargetInfo(); + auto ptrdiffTy = targetInfo.getPtrDiffType(clang::LangAS::Default); + bool ptrdiffTyIsSigned = clang::TargetInfo::isTypeSigned(ptrdiffTy); + uint64_t ptrdiffTyWidth = targetInfo.getTypeWidth(ptrdiffTy); + + vtableElemTy = cir::IntType::get(builder.getContext(), ptrdiffTyWidth, + ptrdiffTyIsSigned); + vtableElemAlign = + llvm::divideCeil(targetInfo.getPointerAlign(clang::LangAS::Default), 8); + } + + // Access vtable to get the offset from the given object to its containing + // complete object. + // TODO: Add a specialized operation to get the object offset? + auto vptrTy = cir::VPtrType::get(builder.getContext()); + cir::PointerType vptrPtrTy = builder.getPointerTo(vptrTy); + auto vptrPtr = + cir::VTableGetVPtrOp::create(builder, loc, vptrPtrTy, op.getSrc()); + mlir::Value vptr = builder.createLoad(loc, vptrPtr); + mlir::Value elementPtr = + builder.createBitcast(vptr, builder.getPointerTo(vtableElemTy)); + mlir::Value minusTwo = builder.getSignedInt(loc, -2, 64); + auto offsetToTopSlotPtr = cir::PtrStrideOp::create( + builder, loc, builder.getPointerTo(vtableElemTy), elementPtr, minusTwo); + mlir::Value offsetToTop = + builder.createAlignedLoad(loc, offsetToTopSlotPtr, vtableElemAlign); + + // Add the offset to the given pointer to get the cast result. + // Cast the input pointer to a uint8_t* to allow pointer arithmetic. + cir::PointerType u8PtrTy = builder.getPointerTo(builder.getUIntNTy(8)); + mlir::Value srcBytePtr = builder.createBitcast(op.getSrc(), u8PtrTy); + auto dstBytePtr = + cir::PtrStrideOp::create(builder, loc, u8PtrTy, srcBytePtr, offsetToTop); + // Cast the result to a void*. 
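// (Reviewer sketch, not part of the patch: a plain-C++ rendering of the
// offset-to-top computation emitted above; in the non-relative vtable layout
// the slot holds a ptrdiff_t and sits two entries before the address the
// vptr points at.)
//
//   #include <cstddef>
//   void *dynamic_cast_to_void(void *obj) {
//     auto *vtable = *reinterpret_cast<std::ptrdiff_t **>(obj); // load vptr
//     std::ptrdiff_t offsetToTop = vtable[-2];                  // slot -2
//     return static_cast<char *>(obj) + offsetToTop; // complete object
//   }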
+ return builder.createBitcast(dstBytePtr, builder.getVoidPtrTy()); } mlir::Value diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index f0d73ac872386..bb75f2d94066f 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -182,7 +182,7 @@ mlir::LogicalResult CIRToLLVMCopyOpLowering::matchAndRewrite( rewriter, op.getLoc(), rewriter.getI32Type(), op.getLength(layout)); assert(!cir::MissingFeatures::aggValueSlotVolatile()); rewriter.replaceOpWithNewOp( - op, adaptor.getDst(), adaptor.getSrc(), length, /*isVolatile=*/false); + op, adaptor.getDst(), adaptor.getSrc(), length, op.getIsVolatile()); return mlir::success(); } @@ -694,8 +694,8 @@ getLLVMMemOrder(std::optional memorder) { llvm_unreachable("unknown memory order"); } -mlir::LogicalResult CIRToLLVMAtomicCmpXchgLowering::matchAndRewrite( - cir::AtomicCmpXchg op, OpAdaptor adaptor, +mlir::LogicalResult CIRToLLVMAtomicCmpXchgOpLowering::matchAndRewrite( + cir::AtomicCmpXchgOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { mlir::Value expected = adaptor.getExpected(); mlir::Value desired = adaptor.getDesired(); @@ -719,8 +719,8 @@ mlir::LogicalResult CIRToLLVMAtomicCmpXchgLowering::matchAndRewrite( return mlir::success(); } -mlir::LogicalResult CIRToLLVMAtomicXchgLowering::matchAndRewrite( - cir::AtomicXchg op, OpAdaptor adaptor, +mlir::LogicalResult CIRToLLVMAtomicXchgOpLowering::matchAndRewrite( + cir::AtomicXchgOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { assert(!cir::MissingFeatures::atomicSyncScopeID()); mlir::LLVM::AtomicOrdering llvmOrder = getLLVMMemOrder(adaptor.getMemOrder()); @@ -730,6 +730,187 @@ mlir::LogicalResult CIRToLLVMAtomicXchgLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMAtomicTestAndSetOpLowering::matchAndRewrite( + cir::AtomicTestAndSetOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + assert(!cir::MissingFeatures::atomicSyncScopeID()); + + mlir::LLVM::AtomicOrdering llvmOrder = getLLVMMemOrder(op.getMemOrder()); + + auto one = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), + rewriter.getI8Type(), 1); + auto rmw = mlir::LLVM::AtomicRMWOp::create( + rewriter, op.getLoc(), mlir::LLVM::AtomicBinOp::xchg, adaptor.getPtr(), + one, llvmOrder, /*syncscope=*/llvm::StringRef(), + adaptor.getAlignment().value_or(0), op.getIsVolatile()); + + auto zero = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), + rewriter.getI8Type(), 0); + auto cmp = mlir::LLVM::ICmpOp::create( + rewriter, op.getLoc(), mlir::LLVM::ICmpPredicate::ne, rmw, zero); + + rewriter.replaceOp(op, cmp); + return mlir::success(); +} + +mlir::LogicalResult CIRToLLVMAtomicClearOpLowering::matchAndRewrite( + cir::AtomicClearOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + assert(!cir::MissingFeatures::atomicSyncScopeID()); + + mlir::LLVM::AtomicOrdering llvmOrder = getLLVMMemOrder(op.getMemOrder()); + auto zero = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), + rewriter.getI8Type(), 0); + auto store = mlir::LLVM::StoreOp::create( + rewriter, op.getLoc(), zero, adaptor.getPtr(), + adaptor.getAlignment().value_or(0), op.getIsVolatile(), + /*isNonTemporal=*/false, /*isInvariantGroup=*/false, llvmOrder); + + rewriter.replaceOp(op, store); + return mlir::success(); +} + +static mlir::LLVM::AtomicBinOp +getLLVMAtomicBinOp(cir::AtomicFetchKind k, bool isInt, bool 
isSignedInt) { + switch (k) { + case cir::AtomicFetchKind::Add: + return isInt ? mlir::LLVM::AtomicBinOp::add : mlir::LLVM::AtomicBinOp::fadd; + case cir::AtomicFetchKind::Sub: + return isInt ? mlir::LLVM::AtomicBinOp::sub : mlir::LLVM::AtomicBinOp::fsub; + case cir::AtomicFetchKind::And: + return mlir::LLVM::AtomicBinOp::_and; + case cir::AtomicFetchKind::Xor: + return mlir::LLVM::AtomicBinOp::_xor; + case cir::AtomicFetchKind::Or: + return mlir::LLVM::AtomicBinOp::_or; + case cir::AtomicFetchKind::Nand: + return mlir::LLVM::AtomicBinOp::nand; + case cir::AtomicFetchKind::Max: { + if (!isInt) + return mlir::LLVM::AtomicBinOp::fmax; + return isSignedInt ? mlir::LLVM::AtomicBinOp::max + : mlir::LLVM::AtomicBinOp::umax; + } + case cir::AtomicFetchKind::Min: { + if (!isInt) + return mlir::LLVM::AtomicBinOp::fmin; + return isSignedInt ? mlir::LLVM::AtomicBinOp::min + : mlir::LLVM::AtomicBinOp::umin; + } + } + llvm_unreachable("Unknown atomic fetch opcode"); +} + +static llvm::StringLiteral getLLVMBinop(cir::AtomicFetchKind k, bool isInt) { + switch (k) { + case cir::AtomicFetchKind::Add: + return isInt ? mlir::LLVM::AddOp::getOperationName() + : mlir::LLVM::FAddOp::getOperationName(); + case cir::AtomicFetchKind::Sub: + return isInt ? mlir::LLVM::SubOp::getOperationName() + : mlir::LLVM::FSubOp::getOperationName(); + case cir::AtomicFetchKind::And: + return mlir::LLVM::AndOp::getOperationName(); + case cir::AtomicFetchKind::Xor: + return mlir::LLVM::XOrOp::getOperationName(); + case cir::AtomicFetchKind::Or: + return mlir::LLVM::OrOp::getOperationName(); + case cir::AtomicFetchKind::Nand: + // There's no nand binop in LLVM, this is later fixed with a not. + return mlir::LLVM::AndOp::getOperationName(); + case cir::AtomicFetchKind::Max: + case cir::AtomicFetchKind::Min: + llvm_unreachable("handled in buildMinMaxPostOp"); + } + llvm_unreachable("Unknown atomic fetch opcode"); +} + +mlir::Value CIRToLLVMAtomicFetchOpLowering::buildPostOp( + cir::AtomicFetchOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter, mlir::Value rmwVal, + bool isInt) const { + SmallVector atomicOperands = {rmwVal, adaptor.getVal()}; + SmallVector atomicResTys = {rmwVal.getType()}; + return rewriter + .create(op.getLoc(), + rewriter.getStringAttr(getLLVMBinop(op.getBinop(), isInt)), + atomicOperands, atomicResTys, {}) + ->getResult(0); +} + +mlir::Value CIRToLLVMAtomicFetchOpLowering::buildMinMaxPostOp( + cir::AtomicFetchOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter, mlir::Value rmwVal, bool isInt, + bool isSigned) const { + mlir::Location loc = op.getLoc(); + + if (!isInt) { + if (op.getBinop() == cir::AtomicFetchKind::Max) + return mlir::LLVM::MaxNumOp::create(rewriter, loc, rmwVal, + adaptor.getVal()); + return mlir::LLVM::MinNumOp::create(rewriter, loc, rmwVal, + adaptor.getVal()); + } + + mlir::LLVM::ICmpPredicate pred; + if (op.getBinop() == cir::AtomicFetchKind::Max) { + pred = isSigned ? mlir::LLVM::ICmpPredicate::sgt + : mlir::LLVM::ICmpPredicate::ugt; + } else { // Min + pred = isSigned ? 
mlir::LLVM::ICmpPredicate::slt
+                    : mlir::LLVM::ICmpPredicate::ult;
+  }
+  mlir::Value cmp = mlir::LLVM::ICmpOp::create(
+      rewriter, loc,
+      mlir::LLVM::ICmpPredicateAttr::get(rewriter.getContext(), pred), rmwVal,
+      adaptor.getVal());
+  return mlir::LLVM::SelectOp::create(rewriter, loc, cmp, rmwVal,
+                                      adaptor.getVal());
+}
+
+mlir::LogicalResult CIRToLLVMAtomicFetchOpLowering::matchAndRewrite(
+    cir::AtomicFetchOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  bool isInt = false;
+  bool isSignedInt = false;
+  if (auto intTy = mlir::dyn_cast<cir::IntType>(op.getVal().getType())) {
+    isInt = true;
+    isSignedInt = intTy.isSigned();
+  } else if (mlir::isa<cir::SingleType, cir::DoubleType>(
+                 op.getVal().getType())) {
+    isInt = false;
+  } else {
+    return op.emitError() << "Unsupported type: " << op.getVal().getType();
+  }
+
+  mlir::LLVM::AtomicOrdering llvmOrder = getLLVMMemOrder(op.getMemOrder());
+  mlir::LLVM::AtomicBinOp llvmBinOp =
+      getLLVMAtomicBinOp(op.getBinop(), isInt, isSignedInt);
+  auto rmwVal = mlir::LLVM::AtomicRMWOp::create(rewriter, op.getLoc(),
+                                                llvmBinOp, adaptor.getPtr(),
+                                                adaptor.getVal(), llvmOrder);
+
+  mlir::Value result = rmwVal.getResult();
+  if (!op.getFetchFirst()) {
+    if (op.getBinop() == cir::AtomicFetchKind::Max ||
+        op.getBinop() == cir::AtomicFetchKind::Min)
+      result = buildMinMaxPostOp(op, adaptor, rewriter, rmwVal.getRes(), isInt,
+                                 isSignedInt);
+    else
+      result = buildPostOp(op, adaptor, rewriter, rmwVal.getRes(), isInt);
+
+    // Compensate for the lack of a nand binop in LLVM IR.
+    if (op.getBinop() == cir::AtomicFetchKind::Nand) {
+      auto negOne = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(),
+                                                   result.getType(), -1);
+      result = mlir::LLVM::XOrOp::create(rewriter, op.getLoc(), result, negOne);
+    }
+  }
+
+  rewriter.replaceOp(op, result);
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRToLLVMBitClrsbOpLowering::matchAndRewrite(
     cir::BitClrsbOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
@@ -1499,6 +1680,54 @@ mlir::LogicalResult CIRToLLVMConstantOpLowering::matchAndRewrite(
   return mlir::success();
 }

+static uint64_t getTypeSize(mlir::Type type, mlir::Operation &op) {
+  mlir::DataLayout layout(op.getParentOfType<mlir::ModuleOp>());
+  // For LLVM purposes we treat void as u8.
+  if (isa<cir::VoidType>(type))
+    type = cir::IntType::get(type.getContext(), 8, /*isSigned=*/false);
+  return llvm::divideCeil(layout.getTypeSizeInBits(type), 8);
+}
+
+mlir::LogicalResult CIRToLLVMPtrDiffOpLowering::matchAndRewrite(
+    cir::PtrDiffOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  auto dstTy = mlir::cast<cir::IntType>(op.getType());
+  mlir::Type llvmDstTy = getTypeConverter()->convertType(dstTy);
+
+  auto lhs = rewriter.create<mlir::LLVM::PtrToIntOp>(op.getLoc(), llvmDstTy,
+                                                     adaptor.getLhs());
+  auto rhs = rewriter.create<mlir::LLVM::PtrToIntOp>(op.getLoc(), llvmDstTy,
+                                                     adaptor.getRhs());
+
+  auto diff =
+      rewriter.create<mlir::LLVM::SubOp>(op.getLoc(), llvmDstTy, lhs, rhs);
+
+  cir::PointerType ptrTy = op.getLhs().getType();
+  assert(!cir::MissingFeatures::llvmLoweringPtrDiffConsidersPointee());
+  uint64_t typeSize = getTypeSize(ptrTy.getPointee(), *op);
+
+  // Avoid silly division by 1.
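// (Reviewer note, not part of the patch: a worked example of the scaling
// below. For `int *p, *q`, `p - q` lowers to an exact signed division:
//   (ptrtoint p - ptrtoint q) sdiv exact 4
// while for `char` pointers the element size is 1 and the division is
// skipped entirely.)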
+ mlir::Value resultVal = diff.getResult(); + if (typeSize != 1) { + auto typeSizeVal = rewriter.create( + op.getLoc(), llvmDstTy, typeSize); + + if (dstTy.isUnsigned()) { + auto uDiv = + rewriter.create(op.getLoc(), diff, typeSizeVal); + uDiv.setIsExact(true); + resultVal = uDiv.getResult(); + } else { + auto sDiv = + rewriter.create(op.getLoc(), diff, typeSizeVal); + sDiv.setIsExact(true); + resultVal = sDiv.getResult(); + } + } + rewriter.replaceOp(op, resultVal); + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMExpectOpLowering::matchAndRewrite( cir::ExpectOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { @@ -1539,6 +1768,7 @@ void CIRToLLVMFuncOpLowering::lowerFuncAttributes( attr.getName() == getLinkageAttrNameString() || attr.getName() == func.getGlobalVisibilityAttrName() || attr.getName() == func.getDsoLocalAttrName() || + attr.getName() == func.getInlineKindAttrName() || (filterArgAndResAttrs && (attr.getName() == func.getArgAttrsAttrName() || attr.getName() == func.getResAttrsAttrName()))) @@ -1623,6 +1853,12 @@ mlir::LogicalResult CIRToLLVMFuncOpLowering::matchAndRewrite( assert(!cir::MissingFeatures::opFuncMultipleReturnVals()); + if (auto inlineKind = op.getInlineKind()) { + fn.setNoInline(inlineKind == cir::InlineKind::NoInline); + fn.setInlineHint(inlineKind == cir::InlineKind::InlineHint); + fn.setAlwaysInline(inlineKind == cir::InlineKind::AlwaysInline); + } + fn.setVisibility_Attr(mlir::LLVM::VisibilityAttr::get( getContext(), lowerCIRVisibilityToLLVMVisibility( op.getGlobalVisibilityAttr().getValue()))); @@ -1793,12 +2029,20 @@ CIRToLLVMGlobalOpLowering::getComdatAttr(cir::GlobalOp &op, if (!comdatOp) { builder.setInsertionPointToStart(module.getBody()); comdatOp = - builder.create(module.getLoc(), comdatName); + mlir::LLVM::ComdatOp::create(builder, module.getLoc(), comdatName); + } + + if (auto comdatSelector = comdatOp.lookupSymbol( + op.getSymName())) { + return mlir::SymbolRefAttr::get( + builder.getContext(), comdatName, + mlir::FlatSymbolRefAttr::get(comdatSelector.getSymNameAttr())); } builder.setInsertionPointToStart(&comdatOp.getBody().back()); - auto selectorOp = builder.create( - comdatOp.getLoc(), op.getSymName(), mlir::LLVM::comdat::Comdat::Any); + auto selectorOp = mlir::LLVM::ComdatSelectorOp::create( + builder, comdatOp.getLoc(), op.getSymName(), + mlir::LLVM::comdat::Comdat::Any); return mlir::SymbolRefAttr::get( builder.getContext(), comdatName, mlir::FlatSymbolRefAttr::get(selectorOp.getSymNameAttr())); diff --git a/clang/lib/CodeGen/ABIInfoImpl.cpp b/clang/lib/CodeGen/ABIInfoImpl.cpp index 13c837a0fb680..1e3ac2e31870f 100644 --- a/clang/lib/CodeGen/ABIInfoImpl.cpp +++ b/clang/lib/CodeGen/ABIInfoImpl.cpp @@ -105,7 +105,7 @@ llvm::Type *CodeGen::getVAListElementType(CodeGenFunction &CGF) { CGCXXABI::RecordArgABI CodeGen::getRecordArgABI(const RecordType *RT, CGCXXABI &CXXABI) { - const RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *RD = RT->getDecl()->getDefinitionOrSelf(); if (const auto *CXXRD = dyn_cast(RD)) return CXXABI.getRecordArgABI(CXXRD); if (!RD->canPassInRegisters()) @@ -136,7 +136,7 @@ bool CodeGen::classifyReturnType(const CGCXXABI &CXXABI, CGFunctionInfo &FI, QualType CodeGen::useFirstFieldIfTransparentUnion(QualType Ty) { if (const RecordType *UT = Ty->getAsUnionType()) { - const RecordDecl *UD = UT->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *UD = UT->getDecl()->getDefinitionOrSelf(); if (UD->hasAttr()) { assert(!UD->field_empty() && "sema 
created an empty transparent union"); return UD->field_begin()->getType(); @@ -274,7 +274,7 @@ bool CodeGen::isEmptyField(ASTContext &Context, const FieldDecl *FD, // according to the Itanium ABI. The exception applies only to records, // not arrays of records, so we must also check whether we stripped off an // array type above. - if (isa(RT->getOriginalDecl()) && + if (isa(RT->getDecl()) && (WasArray || (!AsIfNoUniqueAddr && !FD->hasAttr()))) return false; diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index f8e8086afc36f..602068436101b 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1200,7 +1200,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline( } } - if (shouldEmitUnifiedLTOModueFlag()) + if (shouldEmitUnifiedLTOModueFlag() && + !TheModule->getModuleFlag("UnifiedLTO")) TheModule->addModuleFlag(llvm::Module::Error, "UnifiedLTO", uint32_t(1)); } diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp index d95dab3a19fea..a0125817df493 100644 --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -86,7 +86,7 @@ namespace { llvm::Value *StoragePtr = CGF.Builder.CreateConstGEP1_64( CGF.Int8Ty, BitFieldPtr, OffsetInChars.getQuantity()); StoragePtr = CGF.Builder.CreateAddrSpaceCast( - StoragePtr, CGF.UnqualPtrTy, "atomic_bitfield_base"); + StoragePtr, CGF.DefaultPtrTy, "atomic_bitfield_base"); BFI = OrigBFI; BFI.Offset = Offset; BFI.StorageSize = AtomicSizeInBits; @@ -374,10 +374,9 @@ bool AtomicInfo::emitMemSetZeroIfNecessary() const { } static void emitAtomicCmpXchg(CodeGenFunction &CGF, AtomicExpr *E, bool IsWeak, - Address Dest, Address Ptr, - Address Val1, Address Val2, - uint64_t Size, - llvm::AtomicOrdering SuccessOrder, + Address Dest, Address Ptr, Address Val1, + Address Val2, Address ExpectedResult, + uint64_t Size, llvm::AtomicOrdering SuccessOrder, llvm::AtomicOrdering FailureOrder, llvm::SyncScope::ID Scope) { // Note that cmpxchg doesn't support weak cmpxchg, at least at the moment. @@ -411,8 +410,30 @@ static void emitAtomicCmpXchg(CodeGenFunction &CGF, AtomicExpr *E, bool IsWeak, CGF.Builder.SetInsertPoint(StoreExpectedBB); // Update the memory at Expected with Old's value. - auto *I = CGF.Builder.CreateStore(Old, Val1); - CGF.addInstToCurrentSourceAtom(I, Old); + llvm::Type *ExpectedType = ExpectedResult.getElementType(); + const llvm::DataLayout &DL = CGF.CGM.getDataLayout(); + uint64_t ExpectedSizeInBytes = DL.getTypeStoreSize(ExpectedType); + + if (ExpectedSizeInBytes == Size) { + // Sizes match: store directly + auto *I = CGF.Builder.CreateStore(Old, ExpectedResult); + CGF.addInstToCurrentSourceAtom(I, Old); + } else { + // store only the first ExpectedSizeInBytes bytes of Old + llvm::Type *OldType = Old->getType(); + + // Allocate temporary storage for Old value + Address OldTmp = + CGF.CreateTempAlloca(OldType, Ptr.getAlignment(), "old.tmp"); + + // Store Old into this temporary + auto *I = CGF.Builder.CreateStore(Old, OldTmp); + CGF.addInstToCurrentSourceAtom(I, Old); + + // Perform memcpy for first ExpectedSizeInBytes bytes + CGF.Builder.CreateMemCpy(ExpectedResult, OldTmp, ExpectedSizeInBytes, + /*isVolatile=*/false); + } // Finally, branch to the exit point. 
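// (Reviewer sketch, not part of the patch: the size-mismatch case the memcpy
// path above handles, with illustrative types. A 3-byte payload may be
// widened to a 4-byte atomic, so on cmpxchg failure only the 3 leading bytes
// of the loaded old value can be stored back into `expected`; storing the
// full widened value would write past the end of the object.)
//
//   struct S { char c[3]; };   // 3 bytes, atomic access widened to i32
//   _Atomic struct S obj;
//   struct S expected, desired;
//   // __c11_atomic_compare_exchange_strong(&obj, &expected, desired, ...)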
CGF.Builder.CreateBr(ContinueBB); @@ -425,13 +446,11 @@ static void emitAtomicCmpXchg(CodeGenFunction &CGF, AtomicExpr *E, bool IsWeak, /// Given an ordering required on success, emit all possible cmpxchg /// instructions to cope with the provided (but possibly only dynamically known) /// FailureOrder. -static void emitAtomicCmpXchgFailureSet(CodeGenFunction &CGF, AtomicExpr *E, - bool IsWeak, Address Dest, Address Ptr, - Address Val1, Address Val2, - llvm::Value *FailureOrderVal, - uint64_t Size, - llvm::AtomicOrdering SuccessOrder, - llvm::SyncScope::ID Scope) { +static void emitAtomicCmpXchgFailureSet( + CodeGenFunction &CGF, AtomicExpr *E, bool IsWeak, Address Dest, Address Ptr, + Address Val1, Address Val2, Address ExpectedResult, + llvm::Value *FailureOrderVal, uint64_t Size, + llvm::AtomicOrdering SuccessOrder, llvm::SyncScope::ID Scope) { llvm::AtomicOrdering FailureOrder; if (llvm::ConstantInt *FO = dyn_cast(FailureOrderVal)) { auto FOS = FO->getSExtValue(); @@ -458,8 +477,8 @@ static void emitAtomicCmpXchgFailureSet(CodeGenFunction &CGF, AtomicExpr *E, // success argument". This condition has been lifted and the only // precondition is 31.7.2.18. Effectively treat this as a DR and skip // language version checks. - emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2, Size, SuccessOrder, - FailureOrder, Scope); + emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2, ExpectedResult, + Size, SuccessOrder, FailureOrder, Scope); return; } @@ -483,18 +502,19 @@ static void emitAtomicCmpXchgFailureSet(CodeGenFunction &CGF, AtomicExpr *E, // Emit all the different atomics CGF.Builder.SetInsertPoint(MonotonicBB); - emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2, - Size, SuccessOrder, llvm::AtomicOrdering::Monotonic, Scope); + emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2, ExpectedResult, Size, + SuccessOrder, llvm::AtomicOrdering::Monotonic, Scope); CGF.Builder.CreateBr(ContBB); CGF.Builder.SetInsertPoint(AcquireBB); - emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2, Size, SuccessOrder, - llvm::AtomicOrdering::Acquire, Scope); + emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2, ExpectedResult, Size, + SuccessOrder, llvm::AtomicOrdering::Acquire, Scope); CGF.Builder.CreateBr(ContBB); CGF.Builder.SetInsertPoint(SeqCstBB); - emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2, Size, SuccessOrder, - llvm::AtomicOrdering::SequentiallyConsistent, Scope); + emitAtomicCmpXchg(CGF, E, IsWeak, Dest, Ptr, Val1, Val2, ExpectedResult, Size, + SuccessOrder, llvm::AtomicOrdering::SequentiallyConsistent, + Scope); CGF.Builder.CreateBr(ContBB); CGF.Builder.SetInsertPoint(ContBB); @@ -538,8 +558,9 @@ static llvm::Value *EmitPostAtomicMinMax(CGBuilderTy &Builder, static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest, Address Ptr, Address Val1, Address Val2, - llvm::Value *IsWeak, llvm::Value *FailureOrder, - uint64_t Size, llvm::AtomicOrdering Order, + Address ExpectedResult, llvm::Value *IsWeak, + llvm::Value *FailureOrder, uint64_t Size, + llvm::AtomicOrdering Order, llvm::SyncScope::ID Scope) { llvm::AtomicRMWInst::BinOp Op = llvm::AtomicRMWInst::Add; bool PostOpMinMax = false; @@ -554,13 +575,15 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest, case AtomicExpr::AO__hip_atomic_compare_exchange_strong: case AtomicExpr::AO__opencl_atomic_compare_exchange_strong: emitAtomicCmpXchgFailureSet(CGF, E, false, Dest, Ptr, Val1, Val2, - FailureOrder, Size, Order, Scope); + ExpectedResult, FailureOrder, Size, Order, + 
Scope); return; case AtomicExpr::AO__c11_atomic_compare_exchange_weak: case AtomicExpr::AO__opencl_atomic_compare_exchange_weak: case AtomicExpr::AO__hip_atomic_compare_exchange_weak: emitAtomicCmpXchgFailureSet(CGF, E, true, Dest, Ptr, Val1, Val2, - FailureOrder, Size, Order, Scope); + ExpectedResult, FailureOrder, Size, Order, + Scope); return; case AtomicExpr::AO__atomic_compare_exchange: case AtomicExpr::AO__atomic_compare_exchange_n: @@ -568,7 +591,8 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest, case AtomicExpr::AO__scoped_atomic_compare_exchange_n: { if (llvm::ConstantInt *IsWeakC = dyn_cast(IsWeak)) { emitAtomicCmpXchgFailureSet(CGF, E, IsWeakC->getZExtValue(), Dest, Ptr, - Val1, Val2, FailureOrder, Size, Order, Scope); + Val1, Val2, ExpectedResult, FailureOrder, + Size, Order, Scope); } else { // Create all the relevant BB's llvm::BasicBlock *StrongBB = @@ -582,12 +606,14 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest, CGF.Builder.SetInsertPoint(StrongBB); emitAtomicCmpXchgFailureSet(CGF, E, false, Dest, Ptr, Val1, Val2, - FailureOrder, Size, Order, Scope); + ExpectedResult, FailureOrder, Size, Order, + Scope); CGF.Builder.CreateBr(ContBB); CGF.Builder.SetInsertPoint(WeakBB); emitAtomicCmpXchgFailureSet(CGF, E, true, Dest, Ptr, Val1, Val2, - FailureOrder, Size, Order, Scope); + ExpectedResult, FailureOrder, Size, Order, + Scope); CGF.Builder.CreateBr(ContBB); CGF.Builder.SetInsertPoint(ContBB); @@ -797,9 +823,9 @@ EmitValToTemp(CodeGenFunction &CGF, Expr *E) { static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *Expr, Address Dest, Address Ptr, Address Val1, Address Val2, - llvm::Value *IsWeak, llvm::Value *FailureOrder, - uint64_t Size, llvm::AtomicOrdering Order, - llvm::Value *Scope) { + Address OriginalVal1, llvm::Value *IsWeak, + llvm::Value *FailureOrder, uint64_t Size, + llvm::AtomicOrdering Order, llvm::Value *Scope) { auto ScopeModel = Expr->getScopeModel(); // LLVM atomic instructions always have sync scope. 
If clang atomic @@ -816,8 +842,8 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *Expr, Address Dest, Order, CGF.getLLVMContext()); else SS = llvm::SyncScope::System; - EmitAtomicOp(CGF, Expr, Dest, Ptr, Val1, Val2, IsWeak, FailureOrder, Size, - Order, SS); + EmitAtomicOp(CGF, Expr, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, + FailureOrder, Size, Order, SS); return; } @@ -826,8 +852,8 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *Expr, Address Dest, auto SCID = CGF.getTargetHooks().getLLVMSyncScopeID( CGF.CGM.getLangOpts(), ScopeModel->map(SC->getZExtValue()), Order, CGF.CGM.getLLVMContext()); - EmitAtomicOp(CGF, Expr, Dest, Ptr, Val1, Val2, IsWeak, FailureOrder, Size, - Order, SCID); + EmitAtomicOp(CGF, Expr, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, + FailureOrder, Size, Order, SCID); return; } @@ -852,12 +878,11 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *Expr, Address Dest, SI->addCase(Builder.getInt32(S), B); Builder.SetInsertPoint(B); - EmitAtomicOp(CGF, Expr, Dest, Ptr, Val1, Val2, IsWeak, FailureOrder, Size, - Order, - CGF.getTargetHooks().getLLVMSyncScopeID(CGF.CGM.getLangOpts(), - ScopeModel->map(S), - Order, - CGF.getLLVMContext())); + EmitAtomicOp(CGF, Expr, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, + FailureOrder, Size, Order, + CGF.getTargetHooks().getLLVMSyncScopeID( + CGF.CGM.getLangOpts(), ScopeModel->map(S), Order, + CGF.getLLVMContext())); Builder.CreateBr(ContBB); } @@ -1058,6 +1083,7 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) { LValue AtomicVal = MakeAddrLValue(Ptr, AtomicTy); AtomicInfo Atomics(*this, AtomicVal); + Address OriginalVal1 = Val1; if (ShouldCastToIntPtrTy) { Ptr = Atomics.castToAtomicIntPointer(Ptr); if (Val1.isValid()) @@ -1301,30 +1327,32 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) { if (llvm::isValidAtomicOrderingCABI(ord)) switch ((llvm::AtomicOrderingCABI)ord) { case llvm::AtomicOrderingCABI::relaxed: - EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size, - llvm::AtomicOrdering::Monotonic, Scope); + EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, + OrderFail, Size, llvm::AtomicOrdering::Monotonic, Scope); break; case llvm::AtomicOrderingCABI::consume: case llvm::AtomicOrderingCABI::acquire: if (IsStore) break; // Avoid crashing on code with undefined behavior - EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size, - llvm::AtomicOrdering::Acquire, Scope); + EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, + OrderFail, Size, llvm::AtomicOrdering::Acquire, Scope); break; case llvm::AtomicOrderingCABI::release: if (IsLoad) break; // Avoid crashing on code with undefined behavior - EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size, - llvm::AtomicOrdering::Release, Scope); + EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, + OrderFail, Size, llvm::AtomicOrdering::Release, Scope); break; case llvm::AtomicOrderingCABI::acq_rel: if (IsLoad || IsStore) break; // Avoid crashing on code with undefined behavior - EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size, - llvm::AtomicOrdering::AcquireRelease, Scope); + EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, + OrderFail, Size, llvm::AtomicOrdering::AcquireRelease, + Scope); break; case llvm::AtomicOrderingCABI::seq_cst: - EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size, + EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, + OrderFail, Size, 
llvm::AtomicOrdering::SequentiallyConsistent, Scope); break; } @@ -1360,13 +1388,13 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) { // Emit all the different atomics Builder.SetInsertPoint(MonotonicBB); - EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size, - llvm::AtomicOrdering::Monotonic, Scope); + EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, OrderFail, + Size, llvm::AtomicOrdering::Monotonic, Scope); Builder.CreateBr(ContBB); if (!IsStore) { Builder.SetInsertPoint(AcquireBB); - EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size, - llvm::AtomicOrdering::Acquire, Scope); + EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, + OrderFail, Size, llvm::AtomicOrdering::Acquire, Scope); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32((int)llvm::AtomicOrderingCABI::consume), AcquireBB); @@ -1375,23 +1403,23 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) { } if (!IsLoad) { Builder.SetInsertPoint(ReleaseBB); - EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size, - llvm::AtomicOrdering::Release, Scope); + EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, + OrderFail, Size, llvm::AtomicOrdering::Release, Scope); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32((int)llvm::AtomicOrderingCABI::release), ReleaseBB); } if (!IsLoad && !IsStore) { Builder.SetInsertPoint(AcqRelBB); - EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size, - llvm::AtomicOrdering::AcquireRelease, Scope); + EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, + OrderFail, Size, llvm::AtomicOrdering::AcquireRelease, Scope); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32((int)llvm::AtomicOrderingCABI::acq_rel), AcqRelBB); } Builder.SetInsertPoint(SeqCstBB); - EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, IsWeak, OrderFail, Size, - llvm::AtomicOrdering::SequentiallyConsistent, Scope); + EmitAtomicOp(*this, E, Dest, Ptr, Val1, Val2, OriginalVal1, IsWeak, OrderFail, + Size, llvm::AtomicOrdering::SequentiallyConsistent, Scope); Builder.CreateBr(ContBB); SI->addCase(Builder.getInt32((int)llvm::AtomicOrderingCABI::seq_cst), SeqCstBB); @@ -1417,6 +1445,11 @@ Address AtomicInfo::convertToAtomicIntPointer(Address Addr) const { uint64_t SourceSizeInBits = CGF.CGM.getDataLayout().getTypeSizeInBits(Ty); if (SourceSizeInBits != AtomicSizeInBits) { Address Tmp = CreateTempAlloca(); + CGF.Builder.CreateMemSet( + Tmp.emitRawPointer(CGF), llvm::ConstantInt::get(CGF.Int8Ty, 0), + CGF.getContext().toCharUnitsFromBits(AtomicSizeInBits).getQuantity(), + Tmp.getAlignment().getAsAlign()); + CGF.Builder.CreateMemCpy(Tmp, Addr, std::min(AtomicSizeInBits, SourceSizeInBits) / 8); Addr = Tmp; diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp index 597127abc9120..20a595e58e4c2 100644 --- a/clang/lib/CodeGen/CGBlocks.cpp +++ b/clang/lib/CodeGen/CGBlocks.cpp @@ -1207,7 +1207,7 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E, } else { // Bitcast the block literal to a generic block literal. 
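// (Reviewer note, not part of the patch: index 3 in the CreateStructGEP
// below is the invoke pointer of the generic block-literal layout defined by
// the Blocks ABI.)
//
//   struct Block_literal_generic {
//     void *isa;                    // index 0
//     int flags;                    // index 1
//     int reserved;                 // index 2
//     void (*invoke)(void *, ...);  // index 3 <- FuncPtr is loaded from here
//     struct Block_descriptor *descriptor;
//   };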
BlockPtr = - Builder.CreatePointerCast(BlockPtr, UnqualPtrTy, "block.literal"); + Builder.CreatePointerCast(BlockPtr, DefaultPtrTy, "block.literal"); // Get pointer to the block invoke function FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 3); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index e82c305b66068..772cb245b7cd7 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -1673,7 +1673,7 @@ static llvm::Value *EmitX86BitTestIntrinsic(CodeGenFunction &CGF, CGF.getLLVMContext(), CGF.getContext().getTypeSize(E->getArg(1)->getType())); llvm::FunctionType *FTy = - llvm::FunctionType::get(CGF.Int8Ty, {CGF.UnqualPtrTy, IntType}, false); + llvm::FunctionType::get(CGF.Int8Ty, {CGF.DefaultPtrTy, IntType}, false); llvm::InlineAsm *IA = llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true); @@ -4283,15 +4283,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, CharUnits Align = CGM.getNaturalTypeAlignment( E->getType()->getAs()->getElementType(), nullptr); - llvm::Value *AlignVal = - llvm::ConstantInt::get(Int32Ty, Align.getQuantity()); llvm::Value *Result; if (BuiltinID == Builtin::BI__builtin_masked_load) { - Function *F = - CGM.getIntrinsic(Intrinsic::masked_load, {RetTy, Ptr->getType()}); - Result = - Builder.CreateCall(F, {Ptr, AlignVal, Mask, PassThru}, "masked_load"); + Result = Builder.CreateMaskedLoad(RetTy, Ptr, Align.getAsAlign(), Mask, + PassThru, "masked_load"); } else { Function *F = CGM.getIntrinsic(Intrinsic::masked_expandload, {RetTy}); Result = @@ -4307,8 +4303,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm::Type *RetTy = CGM.getTypes().ConvertType(E->getType()); CharUnits Align = CGM.getNaturalTypeAlignment( E->getType()->getAs()->getElementType(), nullptr); - llvm::Value *AlignVal = - llvm::ConstantInt::get(Int32Ty, Align.getQuantity()); llvm::Value *PassThru = llvm::PoisonValue::get(RetTy); if (E->getNumArgs() > 3) @@ -4318,12 +4312,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, E->getType()->getAs()->getElementType()); llvm::Value *PtrVec = Builder.CreateGEP(ElemTy, Ptr, Idx); - llvm::Value *Result; - Function *F = - CGM.getIntrinsic(Intrinsic::masked_gather, {RetTy, PtrVec->getType()}); - - Result = Builder.CreateCall(F, {PtrVec, AlignVal, Mask, PassThru}, - "masked_gather"); + llvm::Value *Result = Builder.CreateMaskedGather( + RetTy, PtrVec, Align.getAsAlign(), Mask, PassThru, "masked_gather"); return RValue::get(Result); } case Builtin::BI__builtin_masked_store: @@ -4338,13 +4328,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, CharUnits Align = CGM.getNaturalTypeAlignment( E->getArg(1)->getType()->getAs()->getElementType(), nullptr); - llvm::Value *AlignVal = - llvm::ConstantInt::get(Int32Ty, Align.getQuantity()); if (BuiltinID == Builtin::BI__builtin_masked_store) { - llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::masked_store, - {ValLLTy, Ptr->getType()}); - Builder.CreateCall(F, {Val, Ptr, AlignVal, Mask}); + Builder.CreateMaskedStore(Val, Ptr, Align.getAsAlign(), Mask); } else { llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::masked_compressstore, {ValLLTy}); @@ -4361,17 +4347,12 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, CharUnits Align = CGM.getNaturalTypeAlignment( E->getArg(2)->getType()->getAs()->getElementType(), nullptr); - llvm::Value *AlignVal = - 
llvm::ConstantInt::get(Int32Ty, Align.getQuantity()); llvm::Type *ElemTy = CGM.getTypes().ConvertType( E->getArg(1)->getType()->getAs()->getElementType()); llvm::Value *PtrVec = Builder.CreateGEP(ElemTy, Ptr, Idx); - Function *F = CGM.getIntrinsic(Intrinsic::masked_scatter, - {Val->getType(), PtrVec->getType()}); - - Builder.CreateCall(F, {Val, PtrVec, AlignVal, Mask}); + Builder.CreateMaskedScatter(Val, PtrVec, Align.getAsAlign(), Mask); return RValue(); } case Builtin::BI__builtin_isinf_sign: { diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index cb16fe1b36c68..b463f889df373 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -230,7 +230,7 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) IntTy = CGM.IntTy; SizeTy = CGM.SizeTy; VoidTy = CGM.VoidTy; - PtrTy = CGM.UnqualPtrTy; + PtrTy = CGM.DefaultPtrTy; if (CGM.getLangOpts().OffloadViaLLVM) Prefix = "llvm"; diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 423bd2632e186..d2b8bf5497d26 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1897,7 +1897,7 @@ bool CodeGenModule::MayDropFunctionReturn(const ASTContext &Context, // complex destructor or a non-trivially copyable type. if (const RecordType *RT = ReturnType.getCanonicalType()->getAsCanonical()) { - if (const auto *ClassDecl = dyn_cast(RT->getOriginalDecl())) + if (const auto *ClassDecl = dyn_cast(RT->getDecl())) return ClassDecl->hasTrivialDestructor(); } return ReturnType.isTriviallyCopyableType(Context); @@ -3865,7 +3865,7 @@ static void setUsedBits(CodeGenModule &CGM, const RecordType *RTy, int Offset, SmallVectorImpl &Bits) { ASTContext &Context = CGM.getContext(); int CharWidth = Context.getCharWidth(); - const RecordDecl *RD = RTy->getOriginalDecl()->getDefinition(); + const RecordDecl *RD = RTy->getDecl()->getDefinition(); const ASTRecordLayout &ASTLayout = Context.getASTRecordLayout(RD); const CGRecordLayout &Layout = CGM.getTypes().getCGRecordLayout(RD); @@ -5949,8 +5949,24 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, CI->getCalledFunction()->getName().starts_with("_Z4sqrt")) { SetSqrtFPAccuracy(CI); } - if (callOrInvoke) + if (callOrInvoke) { *callOrInvoke = CI; + if (CGM.getCodeGenOpts().CallGraphSection) { + QualType CST; + if (TargetDecl && TargetDecl->getFunctionType()) + CST = QualType(TargetDecl->getFunctionType(), 0); + else if (const auto *FPT = + Callee.getAbstractInfo().getCalleeFunctionProtoType()) + CST = QualType(FPT, 0); + else + llvm_unreachable( + "Cannot find the callee type to generate callee_type metadata."); + + // Set type identifier metadata of indirect calls for call graph section. + if (!CST.isNull()) + CGM.createCalleeTypeMetadataForIcall(CST, *callOrInvoke); + } + } // If this is within a function that has the guard(nocf) attribute and is an // indirect call, add the "guard_nocf" attribute to this call to indicate that diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 41fac9e436f3d..62f5d2f789326 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -2026,7 +2026,7 @@ void CodeGenFunction::EnterDtorCleanups(const CXXDestructorDecl *DD, // Anonymous union members do not have their destructors called. 
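// (Reviewer example, not part of the patch: the case skipped below. A
// variant member of an anonymous union is never destroyed implicitly, so the
// enclosing class must do it by hand; illustrative only, a real Holder would
// also construct `s` with placement new.)
//
//   struct Holder {
//     union { std::string s; };          // anonymous union member
//     ~Holder() { s.~basic_string(); }   // not called automatically
//   };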
const RecordType *RT = type->getAsUnionType(); - if (RT && RT->getOriginalDecl()->isAnonymousStructOrUnion()) + if (RT && RT->getDecl()->isAnonymousStructOrUnion()) continue; CleanupKind cleanupKind = getCleanupKind(dtorKind); diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 9dcfe7a6d0bc5..4a4201c30b272 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -697,6 +697,68 @@ StringRef CGDebugInfo::getCurrentDirname() { return CGM.getCodeGenOpts().DebugCompilationDir; } +static llvm::dwarf::SourceLanguage GetSourceLanguage(const CodeGenModule &CGM) { + const CodeGenOptions &CGO = CGM.getCodeGenOpts(); + const LangOptions &LO = CGM.getLangOpts(); + + assert(CGO.DwarfVersion <= 5); + + llvm::dwarf::SourceLanguage LangTag; + if (LO.CPlusPlus) { + if (LO.ObjC) + LangTag = llvm::dwarf::DW_LANG_ObjC_plus_plus; + else if (CGO.DebugStrictDwarf && CGO.DwarfVersion < 5) + LangTag = llvm::dwarf::DW_LANG_C_plus_plus; + else if (LO.CPlusPlus14) + LangTag = llvm::dwarf::DW_LANG_C_plus_plus_14; + else if (LO.CPlusPlus11) + LangTag = llvm::dwarf::DW_LANG_C_plus_plus_11; + else + LangTag = llvm::dwarf::DW_LANG_C_plus_plus; + } else if (LO.ObjC) { + LangTag = llvm::dwarf::DW_LANG_ObjC; + } else if (LO.OpenCL && (!CGO.DebugStrictDwarf || CGO.DwarfVersion >= 5)) { + LangTag = llvm::dwarf::DW_LANG_OpenCL; + } else if (LO.C11 && !(CGO.DebugStrictDwarf && CGO.DwarfVersion < 5)) { + LangTag = llvm::dwarf::DW_LANG_C11; + } else if (LO.C99) { + LangTag = llvm::dwarf::DW_LANG_C99; + } else { + LangTag = llvm::dwarf::DW_LANG_C89; + } + + return LangTag; +} + +static llvm::DISourceLanguageName +GetDISourceLanguageName(const CodeGenModule &CGM) { + // Emit pre-DWARFv6 language codes. + if (CGM.getCodeGenOpts().DwarfVersion < 6) + return llvm::DISourceLanguageName(GetSourceLanguage(CGM)); + + const LangOptions &LO = CGM.getLangOpts(); + + uint32_t LangVersion = 0; + llvm::dwarf::SourceLanguageName LangTag; + if (LO.CPlusPlus) { + if (LO.ObjC) { + LangTag = llvm::dwarf::DW_LNAME_ObjC_plus_plus; + } else { + LangTag = llvm::dwarf::DW_LNAME_C_plus_plus; + LangVersion = LO.getCPlusPlusLangStd().value_or(0); + } + } else if (LO.ObjC) { + LangTag = llvm::dwarf::DW_LNAME_ObjC; + } else if (LO.OpenCL) { + LangTag = llvm::dwarf::DW_LNAME_OpenCL_C; + } else { + LangTag = llvm::dwarf::DW_LNAME_C; + LangVersion = LO.getCLangStd().value_or(0); + } + + return llvm::DISourceLanguageName(LangTag, LangVersion); +} + void CGDebugInfo::CreateCompileUnit() { SmallString<64> Checksum; std::optional CSKind; @@ -752,31 +814,6 @@ void CGDebugInfo::CreateCompileUnit() { } } - llvm::dwarf::SourceLanguage LangTag; - if (LO.CPlusPlus) { - if (LO.ObjC) - LangTag = llvm::dwarf::DW_LANG_ObjC_plus_plus; - else if (CGO.DebugStrictDwarf && CGO.DwarfVersion < 5) - LangTag = llvm::dwarf::DW_LANG_C_plus_plus; - else if (LO.CPlusPlus14) - LangTag = llvm::dwarf::DW_LANG_C_plus_plus_14; - else if (LO.CPlusPlus11) - LangTag = llvm::dwarf::DW_LANG_C_plus_plus_11; - else - LangTag = llvm::dwarf::DW_LANG_C_plus_plus; - } else if (LO.ObjC) { - LangTag = llvm::dwarf::DW_LANG_ObjC; - } else if (LO.OpenCL && (!CGM.getCodeGenOpts().DebugStrictDwarf || - CGM.getCodeGenOpts().DwarfVersion >= 5)) { - LangTag = llvm::dwarf::DW_LANG_OpenCL; - } else if (LO.C11 && !(CGO.DebugStrictDwarf && CGO.DwarfVersion < 5)) { - LangTag = llvm::dwarf::DW_LANG_C11; - } else if (LO.C99) { - LangTag = llvm::dwarf::DW_LANG_C99; - } else { - LangTag = llvm::dwarf::DW_LANG_C89; - } - std::string Producer = 
getClangFullVersion(); // Figure out which version of the ObjC runtime we have. @@ -837,7 +874,7 @@ void CGDebugInfo::CreateCompileUnit() { // Create new compile unit. TheCU = DBuilder.createCompileUnit( - llvm::DISourceLanguageName(LangTag), CUFile, + GetDISourceLanguageName(CGM), CUFile, CGOpts.EmitVersionIdentMetadata ? Producer : "", CGOpts.OptimizationLevel != 0 || CGOpts.PrepareForLTO || CGOpts.PrepareForThinLTO, @@ -1291,20 +1328,46 @@ llvm::DIType *CGDebugInfo::CreateType(const PointerType *Ty, Ty->getPointeeType(), Unit); } -/// \return whether a C++ mangling exists for the type defined by TD. -static bool hasCXXMangling(const TagDecl *TD, llvm::DICompileUnit *TheCU) { - switch (TheCU->getSourceLanguage().getUnversionedName()) { +static bool hasCXXMangling(llvm::dwarf::SourceLanguage Lang, bool IsTagDecl) { + switch (Lang) { case llvm::dwarf::DW_LANG_C_plus_plus: case llvm::dwarf::DW_LANG_C_plus_plus_11: case llvm::dwarf::DW_LANG_C_plus_plus_14: return true; case llvm::dwarf::DW_LANG_ObjC_plus_plus: - return isa(TD) || isa(TD); + return IsTagDecl; + default: + return false; + } +} + +static bool hasCXXMangling(llvm::dwarf::SourceLanguageName Lang, + bool IsTagDecl) { + switch (Lang) { + case llvm::dwarf::DW_LNAME_C_plus_plus: + return true; + case llvm::dwarf::DW_LNAME_ObjC_plus_plus: + return IsTagDecl; default: return false; } } +/// \return whether a C++ mangling exists for the type defined by TD. +static bool hasCXXMangling(const TagDecl *TD, llvm::DICompileUnit *TheCU) { + const bool IsTagDecl = isa(TD) || isa(TD); + + if (llvm::DISourceLanguageName SourceLang = TheCU->getSourceLanguage(); + SourceLang.hasVersionedName()) + return hasCXXMangling( + static_cast(SourceLang.getName()), + IsTagDecl); + else + return hasCXXMangling( + static_cast(SourceLang.getName()), + IsTagDecl); +} + // Determines if the debug info for this tag declaration needs a type // identifier. The purpose of the unique identifier is to deduplicate type // information for identical types across TUs. 
Because of the C++ one definition @@ -1347,7 +1410,7 @@ static bool needsTypeIdentifier(const TagDecl *TD, CodeGenModule &CGM, static SmallString<256> getTypeIdentifier(const TagType *Ty, CodeGenModule &CGM, llvm::DICompileUnit *TheCU) { SmallString<256> Identifier; - const TagDecl *TD = Ty->getOriginalDecl()->getDefinitionOrSelf(); + const TagDecl *TD = Ty->getDecl()->getDefinitionOrSelf(); if (!needsTypeIdentifier(TD, CGM, TheCU)) return Identifier; @@ -1383,7 +1446,7 @@ static llvm::dwarf::Tag getTagForRecord(const RecordDecl *RD) { llvm::DICompositeType * CGDebugInfo::getOrCreateRecordFwdDecl(const RecordType *Ty, llvm::DIScope *Ctx) { - const RecordDecl *RD = Ty->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *RD = Ty->getDecl()->getDefinitionOrSelf(); if (llvm::DIType *T = getTypeOrNull(QualType(Ty, 0))) return cast(T); llvm::DIFile *DefUnit = getOrCreateFile(RD->getLocation()); @@ -2460,7 +2523,7 @@ void CGDebugInfo::CollectCXXBasesAux( for (const auto &BI : Bases) { const auto *Base = cast( - BI.getType()->castAsCanonical()->getOriginalDecl()) + BI.getType()->castAsCanonical()->getDecl()) ->getDefinition(); if (!SeenTypes.insert(Base).second) continue; @@ -3137,7 +3200,7 @@ void CGDebugInfo::completeRequiredType(const RecordDecl *RD) { } llvm::DIType *CGDebugInfo::CreateType(const RecordType *Ty) { - RecordDecl *RD = Ty->getOriginalDecl()->getDefinitionOrSelf(); + RecordDecl *RD = Ty->getDecl()->getDefinitionOrSelf(); llvm::DIType *T = cast_or_null(getTypeOrNull(QualType(Ty, 0))); if (T || shouldOmitDefinition(DebugKind, DebugTypeExtRefs, RD, CGM.getLangOpts())) { @@ -3165,7 +3228,7 @@ llvm::DIType *CGDebugInfo::GetPreferredNameType(const CXXRecordDecl *RD, std::pair CGDebugInfo::CreateTypeDefinition(const RecordType *Ty) { - RecordDecl *RD = Ty->getOriginalDecl()->getDefinitionOrSelf(); + RecordDecl *RD = Ty->getDecl()->getDefinitionOrSelf(); // Get overall information about the record type for the debug info. llvm::DIFile *DefUnit = getOrCreateFile(RD->getLocation()); @@ -3808,7 +3871,7 @@ llvm::DIType *CGDebugInfo::CreateType(const HLSLInlineSpirvType *Ty, static auto getEnumInfo(CodeGenModule &CGM, llvm::DICompileUnit *TheCU, const EnumType *Ty) { - const EnumDecl *ED = Ty->getOriginalDecl()->getDefinitionOrSelf(); + const EnumDecl *ED = Ty->getDecl()->getDefinitionOrSelf(); uint64_t Size = 0; uint32_t Align = 0; @@ -4211,7 +4274,7 @@ CGDebugInfo::getOrCreateLimitedType(const RecordType *Ty) { // TODO: Currently used for context chains when limiting debug info. llvm::DICompositeType *CGDebugInfo::CreateLimitedType(const RecordType *Ty) { - RecordDecl *RD = Ty->getOriginalDecl()->getDefinitionOrSelf(); + RecordDecl *RD = Ty->getDecl()->getDefinitionOrSelf(); // Get overall information about the record type for the debug info. StringRef RDName = getClassName(RD); @@ -4298,8 +4361,7 @@ llvm::DICompositeType *CGDebugInfo::CreateLimitedType(const RecordType *Ty) { break; } - if (auto *CTSD = - dyn_cast(Ty->getOriginalDecl())) { + if (auto *CTSD = dyn_cast(Ty->getDecl())) { CXXRecordDecl *TemplateDecl = CTSD->getSpecializedTemplate()->getTemplatedDecl(); RegionMap[TemplateDecl].reset(RealDecl); @@ -5209,7 +5271,7 @@ llvm::DILocalVariable *CGDebugInfo::EmitDeclare(const VarDecl *VD, } else if (const auto *RT = dyn_cast(VD->getType())) { // If VD is an anonymous union then Storage represents value for // all union fields. 
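// (Reviewer example, not part of the patch: the source pattern handled
// here.)
//
//   void f() {
//     union { int i; float f; };  // anonymous union: members are in scope
//     i = 1;  // debug info emits artificial variables for both i and f
//   }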
- const RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *RD = RT->getDecl()->getDefinitionOrSelf(); if (RD->isUnion() && RD->isAnonymousStructOrUnion()) { // GDB has trouble finding local variables in anonymous unions, so we emit // artificial local variables for each of the members. @@ -6053,9 +6115,8 @@ llvm::DIGlobalVariableExpression *CGDebugInfo::CollectAnonRecordDecls( // Ignore unnamed fields, but recurse into anonymous records. if (FieldName.empty()) { if (const auto *RT = dyn_cast(Field->getType())) - GVE = CollectAnonRecordDecls( - RT->getOriginalDecl()->getDefinitionOrSelf(), Unit, LineNo, - LinkageName, MS, Var, DContext); + GVE = CollectAnonRecordDecls(RT->getDecl()->getDefinitionOrSelf(), Unit, + LineNo, LinkageName, MS, Var, DContext); continue; } // Use VarDecl's Tag, Scope and Line number. @@ -6104,7 +6165,7 @@ static bool ReferencesAnonymousEntity(RecordType *RT) { // But so long as it's not one of those, it doesn't matter if some sub-type // of the record (a template parameter) can't be reconstituted - because the // un-reconstitutable type itself will carry its own name. - const auto *RD = dyn_cast(RT->getOriginalDecl()); + const auto *RD = dyn_cast(RT->getDecl()); if (!RD) return false; if (!RD->getIdentifier()) @@ -6166,7 +6227,7 @@ struct ReconstitutableType : public RecursiveASTVisitor { bool TraverseEnumType(EnumType *ET, bool = false) { // Unnamed enums can't be reconstituted due to a lack of column info we // produce in the DWARF, so we can't get Clang's full name back. - if (const auto *ED = dyn_cast(ET->getOriginalDecl())) { + if (const auto *ED = dyn_cast(ET->getDecl())) { if (!ED->getIdentifier()) { Reconstitutable = false; return false; diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index d577d5d530310..657bae4f97945 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -1756,7 +1756,7 @@ LValue CodeGenFunction::EmitUnsupportedLValue(const Expr *E, const char *Name) { ErrorUnsupported(E, Name); llvm::Type *ElTy = ConvertType(E->getType()); - llvm::Type *Ty = UnqualPtrTy; + llvm::Type *Ty = DefaultPtrTy; return MakeAddrLValue( Address(llvm::UndefValue::get(Ty), ElTy, CharUnits::One()), E->getType()); } @@ -2011,7 +2011,7 @@ static bool isConstantEmittableObjectType(QualType type) { // Otherwise, all object types satisfy this except C++ classes with // mutable subobjects or non-trivial copy/destroy behavior. 
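// (Reviewer examples, not part of the patch: the distinction drawn below.)
//
//   struct Plain   { int x; };             // constant-emittable
//   struct Caching { mutable int hits; };  // mutable subobject: excluded
//   struct Owner   { ~Owner(); };          // non-trivial destroy: excluded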
if (const auto *RT = dyn_cast(type)) - if (const auto *RD = dyn_cast(RT->getOriginalDecl())) { + if (const auto *RD = dyn_cast(RT->getDecl())) { RD = RD->getDefinitionOrSelf(); if (RD->hasMutableFields() || !RD->isTrivial()) return false; @@ -4270,7 +4270,7 @@ void CodeGenFunction::EmitCfiCheckFail() { llvm::StructType::get(Int8Ty, SourceLocationTy, VoidPtrTy); llvm::Value *V = Builder.CreateConstGEP2_32( - CfiCheckFailDataTy, Builder.CreatePointerCast(Data, UnqualPtrTy), 0, 0); + CfiCheckFailDataTy, Builder.CreatePointerCast(Data, DefaultPtrTy), 0, 0); Address CheckKindAddr(V, Int8Ty, getIntAlign()); llvm::Value *CheckKind = Builder.CreateLoad(CheckKindAddr); @@ -4564,7 +4564,7 @@ static bool IsPreserveAIArrayBase(CodeGenFunction &CGF, const Expr *ArrayBase) { const auto *PointeeT = PtrT->getPointeeType() ->getUnqualifiedDesugaredType(); if (const auto *RecT = dyn_cast(PointeeT)) - return RecT->getOriginalDecl() + return RecT->getDecl() ->getMostRecentDecl() ->hasAttr(); return false; @@ -5711,7 +5711,7 @@ std::optional HandleConditionalOperatorLValueSimpleCase( if (auto *ThrowExpr = dyn_cast(Live->IgnoreParens())) { CGF.EmitCXXThrowExpr(ThrowExpr); llvm::Type *ElemTy = CGF.ConvertType(Dead->getType()); - llvm::Type *Ty = CGF.UnqualPtrTy; + llvm::Type *Ty = CGF.DefaultPtrTy; return CGF.MakeAddrLValue( Address(llvm::UndefValue::get(Ty), ElemTy, CharUnits::One()), Dead->getType()); @@ -7023,7 +7023,7 @@ void CodeGenFunction::FlattenAccessAndTypeLValue( WorkList.emplace_back(LVal, CAT->getElementType(), IdxListCopy); } } else if (const auto *RT = dyn_cast(T)) { - const RecordDecl *Record = RT->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *Record = RT->getDecl()->getDefinitionOrSelf(); assert(!Record->isUnion() && "Union types not supported in flat cast."); const CXXRecordDecl *CXXD = dyn_cast(Record); diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp index 07b9aebe0bbe3..eee397f1f3d19 100644 --- a/clang/lib/CodeGen/CGExprAgg.cpp +++ b/clang/lib/CodeGen/CGExprAgg.cpp @@ -2080,7 +2080,7 @@ static CharUnits GetNumNonZeroBytesInInit(const Expr *E, CodeGenFunction &CGF) { // referencee. InitListExprs for unions and arrays can't have references. 
if (const RecordType *RT = E->getType()->getAsCanonical()) { if (!RT->isUnionType()) { - RecordDecl *SD = RT->getOriginalDecl()->getDefinitionOrSelf(); + RecordDecl *SD = RT->getDecl()->getDefinitionOrSelf(); CharUnits NumNonZeroBytes = CharUnits::Zero(); unsigned ILEElement = 0; @@ -2133,7 +2133,7 @@ static void CheckAggExprForMemSetUse(AggValueSlot &Slot, const Expr *E, if (const RecordType *RT = CGF.getContext() .getBaseElementType(E->getType()) ->getAsCanonical()) { - const CXXRecordDecl *RD = cast(RT->getOriginalDecl()); + const auto *RD = cast(RT->getDecl()); if (RD->hasUserDeclaredConstructor()) return; } diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index 31ac26662b4c8..14d8db32bafc6 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -1236,8 +1236,8 @@ void CodeGenFunction::EmitNewArrayInitializer( if (auto *ILE = dyn_cast(Init)) { if (const RecordType *RType = ILE->getType()->getAsCanonical()) { - if (RType->getOriginalDecl()->isStruct()) { - const RecordDecl *RD = RType->getOriginalDecl()->getDefinitionOrSelf(); + if (RType->getDecl()->isStruct()) { + const RecordDecl *RD = RType->getDecl()->getDefinitionOrSelf(); unsigned NumElements = 0; if (auto *CXXRD = dyn_cast(RD)) NumElements = CXXRD->getNumBases(); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 16d9f2961b149..334951251c634 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -3587,7 +3587,7 @@ Value *ScalarExprEmitter::VisitOffsetOfExpr(OffsetOfExpr *E) { } const ASTRecordLayout &RL = CGF.getContext().getASTRecordLayout( - CurrentType->castAsCanonical()->getOriginalDecl()); + CurrentType->castAsCanonical()->getDecl()); // Save the element type. CurrentType = ON.getBase()->getType(); diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 4f2f5a761f197..384bd59e7533a 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -160,6 +160,16 @@ static Value *handleHlslSplitdouble(const CallExpr *E, CodeGenFunction *CGF) { return LastInst; } +static Value *emitBufferStride(CodeGenFunction *CGF, const Expr *HandleExpr, + LValue &Stride) { + // Figure out the stride of the buffer elements from the handle type. 
+ auto *HandleTy = + cast(HandleExpr->getType().getTypePtr()); + QualType ElementTy = HandleTy->getContainedType(); + Value *StrideValue = CGF->getTypeSize(ElementTy); + return CGF->Builder.CreateStore(StrideValue, Stride.getAddress()); +} + // Return dot product intrinsic that corresponds to the QT scalar type static Intrinsic::ID getDotProductIntrinsic(CGHLSLRuntime &RT, QualType QT) { if (QT->isFloatingType()) @@ -372,6 +382,19 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, RetTy, CGM.getHLSLRuntime().getNonUniformResourceIndexIntrinsic(), ArrayRef{IndexOp}); } + case Builtin::BI__builtin_hlsl_resource_getdimensions_x: { + Value *Handle = EmitScalarExpr(E->getArg(0)); + LValue Dim = EmitLValue(E->getArg(1)); + llvm::Type *RetTy = llvm::Type::getInt32Ty(getLLVMContext()); + Value *DimValue = Builder.CreateIntrinsic( + RetTy, CGM.getHLSLRuntime().getGetDimensionsXIntrinsic(), + ArrayRef{Handle}); + return Builder.CreateStore(DimValue, Dim.getAddress()); + } + case Builtin::BI__builtin_hlsl_resource_getstride: { + LValue Stride = EmitLValue(E->getArg(1)); + return emitBufferStride(this, E->getArg(0), Stride); + } case Builtin::BI__builtin_hlsl_all: { Value *Op0 = EmitScalarExpr(E->getArg(0)); return Builder.CreateIntrinsic( diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 603cef9116dc2..ecab9336a9f82 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -519,6 +519,10 @@ void clang::CodeGen::CGHLSLRuntime::setHLSLEntryAttributes( if (CGM.getCodeGenOpts().OptimizationLevel == 0) Fn->addFnAttr(llvm::Attribute::OptimizeNone); Fn->addFnAttr(llvm::Attribute::NoInline); + + if (CGM.getLangOpts().HLSLSpvEnableMaximalReconvergence) { + Fn->addFnAttr("enable-maximal-reconvergence", "true"); + } } static Value *buildVectorInput(IRBuilder<> &B, Function *F, llvm::Type *Ty) { diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 7c6c2850fd4d4..103b4a98f6c26 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -135,6 +135,7 @@ class CGHLSLRuntime { GENERATE_HLSL_INTRINSIC_FUNCTION(BufferUpdateCounter, resource_updatecounter) GENERATE_HLSL_INTRINSIC_FUNCTION(GroupMemoryBarrierWithGroupSync, group_memory_barrier_with_group_sync) + GENERATE_HLSL_INTRINSIC_FUNCTION(GetDimensionsX, resource_getdimensions_x) //===----------------------------------------------------------------------===// // End of reserved area for HLSL intrinsic getters. diff --git a/clang/lib/CodeGen/CGNonTrivialStruct.cpp b/clang/lib/CodeGen/CGNonTrivialStruct.cpp index 2d70e4c2e0394..0a383c8f919d9 100644 --- a/clang/lib/CodeGen/CGNonTrivialStruct.cpp +++ b/clang/lib/CodeGen/CGNonTrivialStruct.cpp @@ -464,8 +464,7 @@ template struct GenFuncBase { if (WrongType) { std::string FuncName = std::string(F->getName()); - SourceLocation Loc = - QT->castAs()->getOriginalDecl()->getLocation(); + SourceLocation Loc = QT->castAs()->getDecl()->getLocation(); CGM.Error(Loc, "special function " + FuncName + " for non-trivial C struct has incorrect type"); return nullptr; diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index dbcce9b86ad52..cb5bb403bb53b 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -338,7 +338,7 @@ class ObjCCommonTypesHelper { /// GcReadWeakFn -- LLVM objc_read_weak (id *src) function. 
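[Reviewer aside, not part of the patch] The two HLSL builtins added above (__builtin_hlsl_resource_getdimensions_x and __builtin_hlsl_resource_getstride) share one lowering shape: compute a scalar, then store it through the out-parameter lvalue. A minimal sketch of that shape with plain IRBuilder; names here are hypothetical and none of the CGHLSLRuntime plumbing is used:

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Stands in for the getdimensions_x handling: call an intrinsic that yields
// an i32 for the handle, then store the result through the out pointer.
static Value *emitGetDimensionsX(IRBuilder<> &B, Function *DimIntrinsic,
                                 Value *Handle, Value *OutPtr) {
  Value *Dim = B.CreateCall(DimIntrinsic, {Handle}, "dim.x");
  return B.CreateStore(Dim, OutPtr);
}

Returning the store instruction mirrors what the new builtin cases hand back to EmitHLSLBuiltinExpr; the stride variant differs only in that the scalar comes from the element type's size rather than an intrinsic call.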
diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp
index dbcce9b86ad52..cb5bb403bb53b 100644
--- a/clang/lib/CodeGen/CGObjCMac.cpp
+++ b/clang/lib/CodeGen/CGObjCMac.cpp
@@ -338,7 +338,7 @@ class ObjCCommonTypesHelper {
   /// GcReadWeakFn -- LLVM objc_read_weak (id *src) function.
   llvm::FunctionCallee getGcReadWeakFn() {
     // id objc_read_weak (id *)
-    llvm::Type *args[] = {CGM.UnqualPtrTy};
+    llvm::Type *args[] = {CGM.DefaultPtrTy};
     llvm::FunctionType *FTy = llvm::FunctionType::get(ObjectPtrTy, args, false);
     return CGM.CreateRuntimeFunction(FTy, "objc_read_weak");
   }
@@ -346,7 +346,7 @@ class ObjCCommonTypesHelper {
   /// GcAssignWeakFn -- LLVM objc_assign_weak function.
   llvm::FunctionCallee getGcAssignWeakFn() {
     // id objc_assign_weak (id, id *)
-    llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy};
+    llvm::Type *args[] = {ObjectPtrTy, CGM.DefaultPtrTy};
     llvm::FunctionType *FTy = llvm::FunctionType::get(ObjectPtrTy, args, false);
     return CGM.CreateRuntimeFunction(FTy, "objc_assign_weak");
   }
@@ -354,7 +354,7 @@ class ObjCCommonTypesHelper {
   /// GcAssignGlobalFn -- LLVM objc_assign_global function.
   llvm::FunctionCallee getGcAssignGlobalFn() {
     // id objc_assign_global(id, id *)
-    llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy};
+    llvm::Type *args[] = {ObjectPtrTy, CGM.DefaultPtrTy};
     llvm::FunctionType *FTy = llvm::FunctionType::get(ObjectPtrTy, args, false);
     return CGM.CreateRuntimeFunction(FTy, "objc_assign_global");
   }
@@ -362,7 +362,7 @@ class ObjCCommonTypesHelper {
   /// GcAssignThreadLocalFn -- LLVM objc_assign_threadlocal function.
   llvm::FunctionCallee getGcAssignThreadLocalFn() {
     // id objc_assign_threadlocal(id src, id * dest)
-    llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy};
+    llvm::Type *args[] = {ObjectPtrTy, CGM.DefaultPtrTy};
     llvm::FunctionType *FTy = llvm::FunctionType::get(ObjectPtrTy, args, false);
     return CGM.CreateRuntimeFunction(FTy, "objc_assign_threadlocal");
   }
@@ -370,7 +370,7 @@ class ObjCCommonTypesHelper {
   /// GcAssignIvarFn -- LLVM objc_assign_ivar function.
   llvm::FunctionCallee getGcAssignIvarFn() {
     // id objc_assign_ivar(id, id *, ptrdiff_t)
-    llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy, CGM.PtrDiffTy};
+    llvm::Type *args[] = {ObjectPtrTy, CGM.DefaultPtrTy, CGM.PtrDiffTy};
     llvm::FunctionType *FTy = llvm::FunctionType::get(ObjectPtrTy, args, false);
     return CGM.CreateRuntimeFunction(FTy, "objc_assign_ivar");
   }
@@ -386,7 +386,7 @@ class ObjCCommonTypesHelper {
   /// GcAssignStrongCastFn -- LLVM objc_assign_strongCast function.
   llvm::FunctionCallee getGcAssignStrongCastFn() {
     // id objc_assign_strongCast(id, id *)
-    llvm::Type *args[] = {ObjectPtrTy, CGM.UnqualPtrTy};
+    llvm::Type *args[] = {ObjectPtrTy, CGM.DefaultPtrTy};
     llvm::FunctionType *FTy = llvm::FunctionType::get(ObjectPtrTy, args, false);
     return CGM.CreateRuntimeFunction(FTy, "objc_assign_strongCast");
   }
@@ -517,7 +517,7 @@ class ObjCTypesHelper : public ObjCCommonTypesHelper {
   /// ExceptionTryEnterFn - LLVM objc_exception_try_enter function.
   llvm::FunctionCallee getExceptionTryEnterFn() {
-    llvm::Type *params[] = {CGM.UnqualPtrTy};
+    llvm::Type *params[] = {CGM.DefaultPtrTy};
     return CGM.CreateRuntimeFunction(
         llvm::FunctionType::get(CGM.VoidTy, params, false),
         "objc_exception_try_enter");
@@ -525,7 +525,7 @@ class ObjCTypesHelper : public ObjCCommonTypesHelper {
   /// ExceptionTryExitFn - LLVM objc_exception_try_exit function.
   llvm::FunctionCallee getExceptionTryExitFn() {
-    llvm::Type *params[] = {CGM.UnqualPtrTy};
+    llvm::Type *params[] = {CGM.DefaultPtrTy};
     return CGM.CreateRuntimeFunction(
         llvm::FunctionType::get(CGM.VoidTy, params, false),
         "objc_exception_try_exit");
@@ -533,7 +533,7 @@ class ObjCTypesHelper : public ObjCCommonTypesHelper {
   /// ExceptionExtractFn - LLVM objc_exception_extract function.
   llvm::FunctionCallee getExceptionExtractFn() {
-    llvm::Type *params[] = {CGM.UnqualPtrTy};
+    llvm::Type *params[] = {CGM.DefaultPtrTy};
     return CGM.CreateRuntimeFunction(
         llvm::FunctionType::get(ObjectPtrTy, params, false),
         "objc_exception_extract");
@@ -550,7 +550,7 @@ class ObjCTypesHelper : public ObjCCommonTypesHelper {
   /// SetJmpFn - LLVM _setjmp function.
   llvm::FunctionCallee getSetJmpFn() {
     // This is specifically the prototype for x86.
-    llvm::Type *params[] = {CGM.UnqualPtrTy};
+    llvm::Type *params[] = {CGM.DefaultPtrTy};
     return CGM.CreateRuntimeFunction(
         llvm::FunctionType::get(CGM.Int32Ty, params, false), "_setjmp",
         llvm::AttributeList::get(CGM.getLLVMContext(),
@@ -1927,7 +1927,7 @@ CGObjCCommonMac::GenerateConstantNSString(const StringLiteral *Literal) {
   // If we don't already have it, construct the type for a constant NSString.
   if (!NSConstantStringType) {
     NSConstantStringType =
-        llvm::StructType::create({CGM.UnqualPtrTy, CGM.Int8PtrTy, CGM.IntTy},
+        llvm::StructType::create({CGM.DefaultPtrTy, CGM.Int8PtrTy, CGM.IntTy},
                                  "struct.__builtin_NSString");
   }
@@ -2495,7 +2495,7 @@ void CGObjCCommonMac::BuildRCBlockVarRecordLayout(const RecordType *RT,
                                                   CharUnits BytePos,
                                                   bool &HasUnion,
                                                   bool ByrefLayout) {
-  const RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf();
+  const RecordDecl *RD = RT->getDecl()->getDefinitionOrSelf();
   SmallVector<const FieldDecl *, 16> Fields(RD->fields());
   llvm::Type *Ty = CGM.getTypes().ConvertType(QualType(RT, 0));
   const llvm::StructLayout *RecLayout =
@@ -5184,7 +5184,7 @@ CGObjCCommonMac::GetIvarLayoutName(IdentifierInfo *Ident,
 }

 void IvarLayoutBuilder::visitRecord(const RecordType *RT, CharUnits offset) {
-  const RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf();
+  const RecordDecl *RD = RT->getDecl()->getDefinitionOrSelf();

   // If this is a union, remember that we had one, because it might mess
   // up the ordering of layout entries.
@@ -5959,7 +5959,7 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper(
       Int8PtrTy, PropertyListPtrTy);

   // ImpnfABITy - LLVM for id (*)(id, SEL, ...)
-  ImpnfABITy = CGM.UnqualPtrTy;
+  ImpnfABITy = CGM.DefaultPtrTy;

   // struct _class_t {
   //   struct _class_t *isa;
@@ -6380,7 +6380,7 @@ void CGObjCNonFragileABIMac::GenerateClass(const ObjCImplementationDecl *ID) {
         CGM.getModule(), ObjCTypes.ImpnfABITy, false,
         llvm::GlobalValue::ExternalLinkage, nullptr, "_objc_empty_vtable");
   else
-    ObjCEmptyVtableVar = llvm::ConstantPointerNull::get(CGM.UnqualPtrTy);
+    ObjCEmptyVtableVar = llvm::ConstantPointerNull::get(CGM.DefaultPtrTy);
 }

 // FIXME: Is this correct (that meta class size is never computed)?
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index f27d2b6bc8f9d..1c3647258163d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1738,12 +1738,12 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition(
     // Copying constructor for the threadprivate variable.
     // Must be NULL - reserved by runtime, but currently it requires that this
     // parameter is always NULL. Otherwise it fires assertion.
-    CopyCtor = llvm::Constant::getNullValue(CGM.UnqualPtrTy);
+    CopyCtor = llvm::Constant::getNullValue(CGM.DefaultPtrTy);
     if (Ctor == nullptr) {
-      Ctor = llvm::Constant::getNullValue(CGM.UnqualPtrTy);
+      Ctor = llvm::Constant::getNullValue(CGM.DefaultPtrTy);
     }
     if (Dtor == nullptr) {
-      Dtor = llvm::Constant::getNullValue(CGM.UnqualPtrTy);
+      Dtor = llvm::Constant::getNullValue(CGM.DefaultPtrTy);
     }
     if (!CGF) {
       auto *InitFunctionTy =
diff --git a/clang/lib/CodeGen/CGPointerAuth.cpp b/clang/lib/CodeGen/CGPointerAuth.cpp
index 375f87a68ee6d..dbb7bc99ac638 100644
--- a/clang/lib/CodeGen/CGPointerAuth.cpp
+++ b/clang/lib/CodeGen/CGPointerAuth.cpp
@@ -426,10 +426,10 @@ CodeGenModule::getConstantSignedPointer(llvm::Constant *Pointer, unsigned Key,
                                         llvm::ConstantInt *OtherDiscriminator) {
   llvm::Constant *AddressDiscriminator;
   if (StorageAddress) {
-    assert(StorageAddress->getType() == UnqualPtrTy);
+    assert(StorageAddress->getType() == DefaultPtrTy);
     AddressDiscriminator = StorageAddress;
   } else {
-    AddressDiscriminator = llvm::Constant::getNullValue(UnqualPtrTy);
+    AddressDiscriminator = llvm::Constant::getNullValue(DefaultPtrTy);
   }

   llvm::ConstantInt *IntegerDiscriminator;
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index db783d11fa7d0..6a24cc37539e4 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -3938,7 +3938,8 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
               const llvm::ArrayRef<LValue> ResultRegDests,
               const llvm::ArrayRef<QualType> ResultRegQualTys,
               const llvm::BitVector &ResultTypeRequiresCast,
-              const llvm::BitVector &ResultRegIsFlagReg) {
+              const std::vector<std::optional<std::pair<uint64_t, uint64_t>>>
+                  &ResultBounds) {
   CGBuilderTy &Builder = CGF.Builder;
   CodeGenModule &CGM = CGF.CGM;
   llvm::LLVMContext &CTX = CGF.getLLVMContext();
@@ -3949,18 +3950,20 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
   // ResultRegDests can be also populated by addReturnRegisterOutputs() above,
   // in which case its size may grow.
   assert(ResultTypeRequiresCast.size() <= ResultRegDests.size());
-  assert(ResultRegIsFlagReg.size() <= ResultRegDests.size());
+  assert(ResultBounds.size() <= ResultRegDests.size());

   for (unsigned i = 0, e = RegResults.size(); i != e; ++i) {
     llvm::Value *Tmp = RegResults[i];
     llvm::Type *TruncTy = ResultTruncRegTypes[i];

-    if ((i < ResultRegIsFlagReg.size()) && ResultRegIsFlagReg[i]) {
-      // Target must guarantee the Value `Tmp` here is lowered to a boolean
-      // value.
-      llvm::Constant *Two = llvm::ConstantInt::get(Tmp->getType(), 2);
+    if ((i < ResultBounds.size()) && ResultBounds[i].has_value()) {
+      const auto [LowerBound, UpperBound] = ResultBounds[i].value();
+      // FIXME: Support for nonzero lower bounds not yet implemented.
+      assert(LowerBound == 0 && "Output operand lower bound is not zero.");
+      llvm::Constant *UpperBoundConst =
+          llvm::ConstantInt::get(Tmp->getType(), UpperBound);
       llvm::Value *IsBooleanValue =
-          Builder.CreateCmp(llvm::CmpInst::ICMP_ULT, Tmp, Two);
+          Builder.CreateCmp(llvm::CmpInst::ICMP_ULT, Tmp, UpperBoundConst);
       llvm::Function *FnAssume = CGM.getIntrinsic(llvm::Intrinsic::assume);
       Builder.CreateCall(FnAssume, IsBooleanValue);
     }
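[Reviewer aside, not part of the patch] On the EmitAsmStores change above: the old code special-cased "{@cc}" flag outputs with a hard-coded bound of 2, while the new ResultBounds path emits the same llvm.assume pattern for any zero-based half-open range. The emitted annotation, reduced to a freestanding sketch with a hypothetical helper name:

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Tell the optimizer that Val is known to lie in [0, UpperBound). With
// UpperBound == 2 this degenerates to the old "flag register is 0 or 1" case.
static void assumeUpperBound(IRBuilder<> &B, Value *Val, uint64_t UpperBound) {
  Value *Bound = ConstantInt::get(Val->getType(), UpperBound);
  B.CreateAssumption(B.CreateICmpULT(Val, Bound, "in.range"));
}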
@@ -4089,7 +4092,7 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
   std::vector<llvm::Type *> ArgElemTypes;
   std::vector<llvm::Value *> Args;
   llvm::BitVector ResultTypeRequiresCast;
-  llvm::BitVector ResultRegIsFlagReg;
+  std::vector<std::optional<std::pair<uint64_t, uint64_t>>> ResultBounds;

   // Keep track of inout constraints.
   std::string InOutConstraints;
@@ -4147,8 +4150,7 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
       ResultRegQualTys.push_back(QTy);
       ResultRegDests.push_back(Dest);

-      bool IsFlagReg = llvm::StringRef(OutputConstraint).starts_with("{@cc");
-      ResultRegIsFlagReg.push_back(IsFlagReg);
+      ResultBounds.emplace_back(Info.getOutputOperandBounds());

       llvm::Type *Ty = ConvertTypeForMem(QTy);
       const bool RequiresCast =
          Info.allowsRegister() &&
@@ -4495,7 +4497,7 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
   EmitAsmStores(*this, S, RegResults, ResultRegTypes, ResultTruncRegTypes,
                 ResultRegDests, ResultRegQualTys, ResultTypeRequiresCast,
-                ResultRegIsFlagReg);
+                ResultBounds);

   // If this is an asm goto with outputs, repeat EmitAsmStores, but with a
   // different insertion point; one for each indirect destination and with
@@ -4506,7 +4508,7 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
     Builder.SetInsertPoint(Succ, --(Succ->end()));
     EmitAsmStores(*this, S, CBRRegResults[Succ], ResultRegTypes,
                   ResultTruncRegTypes, ResultRegDests, ResultRegQualTys,
-                  ResultTypeRequiresCast, ResultRegIsFlagReg);
+                  ResultTypeRequiresCast, ResultBounds);
   }
 }
 }
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 6d8c3965598e5..a1fc17bb80212 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -87,6 +87,7 @@ static llvm::cl::opt<bool> LimitedCoverage(
     llvm::cl::desc("Emit limited coverage mapping information (experimental)"));

 static const char AnnotationSection[] = "llvm.metadata";
+static constexpr auto ErrnoTBAAMDName = "llvm.errno.tbaa";

 static CGCXXABI *createCXXABI(CodeGenModule &CGM) {
   switch (CGM.getContext().getCXXABIKind()) {
@@ -1587,6 +1588,17 @@ void CodeGenModule::Release() {
       }
     }
   }
+
+  // Emit `!llvm.errno.tbaa`, a module-level metadata that specifies the TBAA
+  // for an int access. This allows LLVM to reason about what memory can be
+  // accessed by certain library calls that only touch errno.
+  if (TBAA) {
+    TBAAAccessInfo TBAAInfo = getTBAAAccessInfo(Context.IntTy);
+    if (llvm::MDNode *IntegerNode = getTBAAAccessTagInfo(TBAAInfo)) {
+      auto *ErrnoTBAAMD = TheModule.getOrInsertNamedMetadata(ErrnoTBAAMDName);
+      ErrnoTBAAMD->addOperand(IntegerNode);
+    }
+  }
 }

 void CodeGenModule::EmitOpenCLMetadata() {
@@ -2352,7 +2364,7 @@ static QualType GeneralizeTransparentUnion(QualType Ty) {
   const RecordType *UT = Ty->getAsUnionType();
   if (!UT)
     return Ty;
-  const RecordDecl *UD = UT->getOriginalDecl()->getDefinitionOrSelf();
+  const RecordDecl *UD = UT->getDecl()->getDefinitionOrSelf();
   if (!UD->hasAttr<TransparentUnionAttr>())
     return Ty;
   for (const auto *it : UD->fields()) {
@@ -2855,6 +2867,11 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D,
     }
   }

+  if (CodeGenOpts.CallGraphSection) {
+    if (auto *FD = dyn_cast<FunctionDecl>(D))
+      createIndirectFunctionTypeMD(FD, F);
+  }
+
   // Emit type metadata on member functions for member function pointer checks.
   // These are only ever necessary on definitions; we're guaranteed that the
   // definition will be present in the LTO unit as a result of LTO visibility.
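[Reviewer aside, not part of the patch] For context on the `!llvm.errno.tbaa` emission above: the named module metadata simply carries the TBAA access tag Clang uses for an `int` access, so optimizations can tell which loads and stores may alias errno. A standalone sketch of building such a node with MDBuilder; the tag construction here is deliberately simplified relative to Clang's CodeGenTBAA:

#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Attach a named module-level node whose single operand is a TBAA access
// tag for a plain C "int" access.
static void emitErrnoTBAA(Module &M) {
  MDBuilder MDB(M.getContext());
  MDNode *Root = MDB.createTBAARoot("Simple C/C++ TBAA");
  MDNode *Char = MDB.createTBAAScalarTypeNode("omnipotent char", Root);
  MDNode *Int = MDB.createTBAAScalarTypeNode("int", Char);
  MDNode *Tag = MDB.createTBAAStructTagNode(Int, Int, /*Offset=*/0);
  M.getOrInsertNamedMetadata("llvm.errno.tbaa")->addOperand(Tag);
}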
@@ -3058,6 +3075,26 @@ static void setLinkageForGV(llvm::GlobalValue *GV, const NamedDecl *ND) {
     GV->setLinkage(llvm::GlobalValue::ExternalWeakLinkage);
 }

+static bool hasExistingGeneralizedTypeMD(llvm::Function *F) {
+  llvm::MDNode *MD = F->getMetadata(llvm::LLVMContext::MD_type);
+  return MD && MD->hasGeneralizedMDString();
+}
+
+void CodeGenModule::createIndirectFunctionTypeMD(const FunctionDecl *FD,
+                                                 llvm::Function *F) {
+  // Return if generalized type metadata is already attached.
+  if (hasExistingGeneralizedTypeMD(F))
+    return;
+
+  // All functions which are not internal linkage could be indirect targets.
+  // Address taken functions with internal linkage could be indirect targets.
+  if (!F->hasLocalLinkage() ||
+      F->getFunction().hasAddressTaken(nullptr, /*IgnoreCallbackUses=*/true,
+                                       /*IgnoreAssumeLikeCalls=*/true,
+                                       /*IgnoreLLVMUsed=*/false))
+    F->addTypeMetadata(0, CreateMetadataIdentifierGeneralized(FD->getType()));
+}
+
 void CodeGenModule::createFunctionTypeMetadataForIcall(const FunctionDecl *FD,
                                                        llvm::Function *F) {
   // Only if we are checking indirect calls.
@@ -3073,10 +3110,12 @@ void CodeGenModule::createFunctionTypeMetadataForIcall(const FunctionDecl *FD,
                                           /*GeneralizePointers=*/false);
   llvm::Metadata *MD = CreateMetadataIdentifierForType(FnType);
   F->addTypeMetadata(0, MD);
-
-  QualType GenPtrFnType = GeneralizeFunctionType(getContext(), FD->getType(),
-                                                 /*GeneralizePointers=*/true);
-  F->addTypeMetadata(0, CreateMetadataIdentifierGeneralized(GenPtrFnType));
+  // Add the generalized identifier if not added already.
+  if (!hasExistingGeneralizedTypeMD(F)) {
+    QualType GenPtrFnType = GeneralizeFunctionType(getContext(), FD->getType(),
+                                                   /*GeneralizePointers=*/true);
+    F->addTypeMetadata(0, CreateMetadataIdentifierGeneralized(GenPtrFnType));
+  }

   // Emit a hash-based bit set entry for cross-DSO calls.
   if (CodeGenOpts.SanitizeCfiCrossDso)
@@ -3084,6 +3123,21 @@
       F->addTypeMetadata(0, llvm::ConstantAsMetadata::get(CrossDsoTypeId));
 }

+void CodeGenModule::createCalleeTypeMetadataForIcall(const QualType &QT,
+                                                     llvm::CallBase *CB) {
+  // Only if needed for call graph section and only for indirect calls.
+  if (!CodeGenOpts.CallGraphSection || !CB->isIndirectCall())
+    return;
+
+  llvm::Metadata *TypeIdMD = CreateMetadataIdentifierGeneralized(QT);
+  llvm::MDTuple *TypeTuple = llvm::MDTuple::get(
+      getLLVMContext(), {llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+                             llvm::Type::getInt64Ty(getLLVMContext()), 0)),
+                         TypeIdMD});
+  llvm::MDTuple *MDN = llvm::MDNode::get(getLLVMContext(), {TypeTuple});
+  CB->setMetadata(llvm::LLVMContext::MD_callee_type, MDN);
+}
+
 void CodeGenModule::setKCFIType(const FunctionDecl *FD, llvm::Function *F) {
   llvm::LLVMContext &Ctx = F->getContext();
   llvm::MDBuilder MDB(Ctx);
@@ -3219,6 +3273,9 @@ void CodeGenModule::SetFunctionAttributes(GlobalDecl GD, llvm::Function *F,
       !CodeGenOpts.SanitizeCfiCanonicalJumpTables)
     createFunctionTypeMetadataForIcall(FD, F);

+  if (CodeGenOpts.CallGraphSection)
+    createIndirectFunctionTypeMD(FD, F);
+
   if (LangOpts.Sanitize.has(SanitizerKind::KCFI))
     setKCFIType(FD, F);
@@ -4237,7 +4294,7 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
 static bool HasNonDllImportDtor(QualType T) {
   if (const auto *RT =
           T->getBaseElementTypeUnsafe()->getAsCanonical<RecordType>())
-    if (auto *RD = dyn_cast<CXXRecordDecl>(RT->getOriginalDecl())) {
+    if (auto *RD = dyn_cast<CXXRecordDecl>(RT->getDecl())) {
       RD = RD->getDefinitionOrSelf();
       if (RD->getDestructor() && !RD->getDestructor()->hasAttr<DLLImportAttr>())
         return true;
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index 3a5e6d36b8e28..4b3b39bb3ad44 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1833,6 +1833,13 @@ class CodeGenModule : public CodeGenTypeCache {
   void createFunctionTypeMetadataForIcall(const FunctionDecl *FD,
                                           llvm::Function *F);

+  /// Create and attach type metadata if the function is a potential indirect
+  /// call target to support call graph section.
+  void createIndirectFunctionTypeMD(const FunctionDecl *FD, llvm::Function *F);
+
+  /// Create and attach type metadata to the given call.
+  void createCalleeTypeMetadataForIcall(const QualType &QT, llvm::CallBase *CB);
+
   /// Set type metadata to the given function.
   void setKCFIType(const FunctionDecl *FD, llvm::Function *F);
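[Reviewer aside, not part of the patch] The `!callee_type` node built by createCalleeTypeMetadataForIcall above is a one-element list of (byte offset, generalized type id) pairs. The same structure as a dependency-light sketch, with the type-id string as a placeholder:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Builds !{!{i64 0, !"<generalized-type-id>"}}, the shape attached to
// indirect call sites for the call graph section.
static MDNode *buildCalleeTypeNode(LLVMContext &Ctx, StringRef TypeId) {
  Metadata *Zero =
      ConstantAsMetadata::get(ConstantInt::get(Type::getInt64Ty(Ctx), 0));
  MDTuple *Pair = MDTuple::get(Ctx, {Zero, MDString::get(Ctx, TypeId)});
  return MDNode::get(Ctx, {Pair});
}

A call site would then carry it via CB->setMetadata(LLVMContext::MD_callee_type, ...), exactly as the patch does.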
diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index f8c7d64cc1aa2..cd08f3ec397a0 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -310,7 +310,7 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) {
     // This also covers anonymous structs and unions, which have a different
     // compatibility rule, but it doesn't matter because you can never have a
     // pointer to an anonymous struct or union.
-    if (!RT->getOriginalDecl()->getDeclName())
+    if (!RT->getDecl()->getDeclName())
       return getAnyPtr(PtrDepth);

     // For non-builtin types use the mangled name of the canonical type.
@@ -332,7 +332,7 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) {
   // Enum types are distinct types. In C++ they have "underlying types",
   // however they aren't related for TBAA.
   if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) {
-    const EnumDecl *ED = ETy->getOriginalDecl()->getDefinitionOrSelf();
+    const EnumDecl *ED = ETy->getDecl()->getDefinitionOrSelf();
     if (!Features.CPlusPlus)
       return getTypeInfo(ED->getIntegerType());
@@ -433,7 +433,7 @@ CodeGenTBAA::CollectFields(uint64_t BaseOffset,
           llvm::MDBuilder::TBAAStructField(BaseOffset, Size, TBAATag));
       return true;
     }
-    const RecordDecl *RD = TTy->getOriginalDecl()->getDefinition();
+    const RecordDecl *RD = TTy->getDecl()->getDefinition();
     if (RD->hasFlexibleArrayMember())
       return false;
@@ -514,7 +514,7 @@ CodeGenTBAA::getTBAAStructInfo(QualType QTy) {
 llvm::MDNode *CodeGenTBAA::getBaseTypeInfoHelper(const Type *Ty) {
   if (auto *TTy = dyn_cast<RecordType>(Ty)) {
-    const RecordDecl *RD = TTy->getOriginalDecl()->getDefinition();
+    const RecordDecl *RD = TTy->getDecl()->getDefinition();
     const ASTRecordLayout &Layout = Context.getASTRecordLayout(RD);
     using TBAAStructField = llvm::MDBuilder::TBAAStructField;
     SmallVector<TBAAStructField, 4> Fields;
@@ -609,8 +609,7 @@ llvm::MDNode *CodeGenTBAA::getValidBaseTypeInfo(QualType QTy) {
   // First calculate the metadata, before recomputing the insertion point, as
   // the helper can recursively call us.
   llvm::MDNode *TypeNode = getBaseTypeInfoHelper(Ty);
-  LLVM_ATTRIBUTE_UNUSED auto inserted =
-      BaseTypeMetadataCache.insert({Ty, TypeNode});
+  [[maybe_unused]] auto inserted = BaseTypeMetadataCache.insert({Ty, TypeNode});
   assert(inserted.second && "BaseType metadata was already inserted");

   return TypeNode;
diff --git a/clang/lib/CodeGen/CodeGenTypeCache.h b/clang/lib/CodeGen/CodeGenTypeCache.h
index e273ebe3b060f..015306bb97373 100644
--- a/clang/lib/CodeGen/CodeGenTypeCache.h
+++ b/clang/lib/CodeGen/CodeGenTypeCache.h
@@ -53,7 +53,7 @@ struct CodeGenTypeCache {
   /// void*, void** in the target's default address space (often 0)
   union {
-    llvm::PointerType *UnqualPtrTy;
+    llvm::PointerType *DefaultPtrTy;
     llvm::PointerType *VoidPtrTy;
     llvm::PointerType *Int8PtrTy;
     llvm::PointerType *VoidPtrPtrTy;
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index f23c7d24e262e..40ebf6e057740 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -373,8 +373,8 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
   }

   // RecordTypes are cached and processed specially.
-  if (const RecordType *RT = dyn_cast<RecordType>(Ty))
-    return ConvertRecordDeclType(RT->getOriginalDecl()->getDefinitionOrSelf());
+  if (const auto *RT = dyn_cast<RecordType>(Ty))
+    return ConvertRecordDeclType(RT->getDecl()->getDefinitionOrSelf());

   llvm::Type *CachedType = nullptr;
   auto TCI = TypeCache.find(Ty);
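[Reviewer aside, not part of the patch] On the CodeGenTypeCache.h hunk above: with opaque pointers, every default-address-space pointer is one uniqued llvm::PointerType, which is why the cache can keep a single object under several union aliases and why renaming UnqualPtrTy to DefaultPtrTy is behavior-neutral. A small check one can compile against the LLVM headers to see the uniquing:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  PointerType *P1 = PointerType::get(Ctx, /*AddressSpace=*/0);
  PointerType *P2 = PointerType::getUnqual(Ctx);
  return P1 == P2 ? 0 : 1; // same uniqued type object, so this returns 0
}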
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 7dc2eaf1e9f75..65c47633bc5c4 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -774,7 +774,7 @@ CGCallee ItaniumCXXABI::EmitLoadOfMemberFunctionPointer(
   } else {
     llvm::Value *VFPAddr =
         CGF.Builder.CreateGEP(CGF.Int8Ty, VTable, VTableOffset);
-    VirtualFn = CGF.Builder.CreateAlignedLoad(CGF.UnqualPtrTy, VFPAddr,
+    VirtualFn = CGF.Builder.CreateAlignedLoad(CGF.DefaultPtrTy, VFPAddr,
                                               CGF.getPointerAlign(),
                                               "memptr.virtualfn");
   }
@@ -816,7 +816,7 @@ CGCallee ItaniumCXXABI::EmitLoadOfMemberFunctionPointer(
   // function pointer.
   CGF.EmitBlock(FnNonVirtual);
   llvm::Value *NonVirtualFn =
-      Builder.CreateIntToPtr(FnAsInt, CGF.UnqualPtrTy, "memptr.nonvirtualfn");
+      Builder.CreateIntToPtr(FnAsInt, CGF.DefaultPtrTy, "memptr.nonvirtualfn");

   // Check the function pointer if CFI on member function pointers is enabled.
   if (ShouldEmitCFICheck) {
@@ -856,7 +856,7 @@ CGCallee ItaniumCXXABI::EmitLoadOfMemberFunctionPointer(
   // We're done.
   CGF.EmitBlock(FnEnd);
-  llvm::PHINode *CalleePtr = Builder.CreatePHI(CGF.UnqualPtrTy, 2);
+  llvm::PHINode *CalleePtr = Builder.CreatePHI(CGF.DefaultPtrTy, 2);
   CalleePtr->addIncoming(VirtualFn, FnVirtual);
   CalleePtr->addIncoming(NonVirtualFn, FnNonVirtual);
@@ -1403,7 +1403,7 @@ void ItaniumCXXABI::emitVirtualObjectDelete(CodeGenFunction &CGF,
   // Grab the vtable pointer as an intptr_t*.
   auto *ClassDecl = ElementType->castAsCXXRecordDecl();
-  llvm::Value *VTable = CGF.GetVTablePtr(Ptr, CGF.UnqualPtrTy, ClassDecl);
+  llvm::Value *VTable = CGF.GetVTablePtr(Ptr, CGF.DefaultPtrTy, ClassDecl);

   // Track back to entry -2 and pull out the offset there.
   llvm::Value *OffsetPtr = CGF.Builder.CreateConstInBoundsGEP1_64(
@@ -1749,7 +1749,7 @@ llvm::Value *ItaniumCXXABI::emitExactDynamicCast(
   auto AuthenticateVTable = [&](Address ThisAddr, const CXXRecordDecl *Decl) {
     if (!CGF.getLangOpts().PointerAuthCalls)
       return;
-    (void)CGF.GetVTablePtr(ThisAddr, CGF.UnqualPtrTy, Decl,
+    (void)CGF.GetVTablePtr(ThisAddr, CGF.DefaultPtrTy, Decl,
                            CodeGenFunction::VTableAuthMode::MustTrap);
   };
@@ -1775,7 +1775,7 @@ llvm::Value *ItaniumCXXABI::emitExactDynamicCast(
     if (PerformPostCastAuthentication)
       VTable = CGF.EmitPointerAuthAuth(StrippingAuthInfo, VTable);
   } else
-    VTable = CGF.GetVTablePtr(ThisAddr, CGF.UnqualPtrTy, SrcDecl);
+    VTable = CGF.GetVTablePtr(ThisAddr, CGF.DefaultPtrTy, SrcDecl);

   // Compare the vptr against the expected vptr for the destination type at
   // this offset.
@@ -1828,7 +1828,7 @@ llvm::Value *ItaniumCXXABI::emitDynamicCastToVoid(CodeGenFunction &CGF,
   if (CGM.getItaniumVTableContext().isRelativeLayout()) {
     // Get the vtable pointer.
     llvm::Value *VTable =
-        CGF.GetVTablePtr(ThisAddr, CGF.UnqualPtrTy, ClassDecl);
+        CGF.GetVTablePtr(ThisAddr, CGF.DefaultPtrTy, ClassDecl);

     // Get the offset-to-top from the vtable.
     OffsetToTop =
@@ -1841,7 +1841,7 @@ llvm::Value *ItaniumCXXABI::emitDynamicCastToVoid(CodeGenFunction &CGF,

     // Get the vtable pointer.
     llvm::Value *VTable =
-        CGF.GetVTablePtr(ThisAddr, CGF.UnqualPtrTy, ClassDecl);
+        CGF.GetVTablePtr(ThisAddr, CGF.DefaultPtrTy, ClassDecl);

     // Get the offset-to-top from the vtable.
     OffsetToTop =
@@ -2578,7 +2578,7 @@ llvm::Value *ItaniumCXXABI::readArrayCookieImpl(CodeGenFunction &CGF,
   // We can't simply ignore this load using nosanitize metadata because
   // the metadata may be lost.
   llvm::FunctionType *FTy =
-      llvm::FunctionType::get(CGF.SizeTy, CGF.UnqualPtrTy, false);
+      llvm::FunctionType::get(CGF.SizeTy, CGF.DefaultPtrTy, false);
   llvm::FunctionCallee F =
       CGM.CreateRuntimeFunction(FTy, "__asan_load_cxx_array_cookie");
   return CGF.Builder.CreateCall(F, numElementsPtr.emitRawPointer(CGF));
@@ -2921,7 +2921,7 @@ static void emitGlobalDtorWithCXAAtExit(CodeGenFunction &CGF,
   // We're assuming that the destructor function is something we can
   // reasonably call with the default CC.
-  llvm::Type *dtorTy = CGF.UnqualPtrTy;
+  llvm::Type *dtorTy = CGF.DefaultPtrTy;

   // Preserve address space of addr.
   auto AddrAS = addr ? addr->getType()->getPointerAddressSpace() : 0;
@@ -3816,7 +3816,7 @@ static bool ShouldUseExternalRTTIDescriptor(CodeGenModule &CGM,
   if (const RecordType *RecordTy = dyn_cast<RecordType>(Ty)) {
     const CXXRecordDecl *RD =
-        cast<CXXRecordDecl>(RecordTy->getOriginalDecl())->getDefinitionOrSelf();
+        cast<CXXRecordDecl>(RecordTy->getDecl())->getDefinitionOrSelf();
     if (!RD->hasDefinition())
       return false;
@@ -3850,9 +3850,7 @@
 /// IsIncompleteClassType - Returns whether the given record type is incomplete.
 static bool IsIncompleteClassType(const RecordType *RecordTy) {
-  return !RecordTy->getOriginalDecl()
-              ->getDefinitionOrSelf()
-              ->isCompleteDefinition();
+  return !RecordTy->getDecl()->getDefinitionOrSelf()->isCompleteDefinition();
 }

 /// ContainsIncompleteClassType - Returns whether the given type contains an
@@ -3985,9 +3983,8 @@ void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty,
     break;

   case Type::Record: {
-    const CXXRecordDecl *RD =
-        cast<CXXRecordDecl>(cast<RecordType>(Ty)->getOriginalDecl())
-            ->getDefinitionOrSelf();
+    const auto *RD = cast<CXXRecordDecl>(cast<RecordType>(Ty)->getDecl())
+                         ->getDefinitionOrSelf();

     if (!RD->hasDefinition() || !RD->getNumBases()) {
       VTableName = ClassTypeInfo;
@@ -4109,8 +4106,8 @@ static llvm::GlobalVariable::LinkageTypes getTypeInfoLinkage(CodeGenModule &CGM,
     return llvm::GlobalValue::LinkOnceODRLinkage;

   if (const RecordType *Record = dyn_cast<RecordType>(Ty)) {
-    const CXXRecordDecl *RD =
-        cast<CXXRecordDecl>(Record->getOriginalDecl())->getDefinitionOrSelf();
+    const auto *RD =
+        cast<CXXRecordDecl>(Record->getDecl())->getDefinitionOrSelf();
     if (RD->hasAttr<WeakAttr>())
       return llvm::GlobalValue::WeakODRLinkage;
     if (CGM.getTriple().isWindowsItaniumEnvironment())
@@ -4273,9 +4270,8 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(
     break;

   case Type::Record: {
-    const CXXRecordDecl *RD =
-        cast<CXXRecordDecl>(cast<RecordType>(Ty)->getOriginalDecl())
-            ->getDefinitionOrSelf();
+    const auto *RD = cast<CXXRecordDecl>(cast<RecordType>(Ty)->getDecl())
+                         ->getDefinitionOrSelf();
     if (!RD->hasDefinition() || !RD->getNumBases()) {
       // We don't need to emit any fields.
       break;
@@ -4322,8 +4318,8 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(
   if (CGM.getTarget().hasPS4DLLImportExport() &&
       GVDLLStorageClass != llvm::GlobalVariable::DLLExportStorageClass) {
     if (const RecordType *RecordTy = dyn_cast<RecordType>(Ty)) {
-      const CXXRecordDecl *RD = cast<CXXRecordDecl>(RecordTy->getOriginalDecl())
-                                    ->getDefinitionOrSelf();
+      const auto *RD =
+          cast<CXXRecordDecl>(RecordTy->getDecl())->getDefinitionOrSelf();
       if (RD->hasAttr<DLLExportAttr>() || CXXRecordNonInlineHasAttr(RD))
         GVDLLStorageClass = llvm::GlobalVariable::DLLExportStorageClass;
@@ -5039,7 +5035,7 @@ static void InitCatchParam(CodeGenFunction &CGF,
     auto catchRD = CatchType->getAsCXXRecordDecl();
     CharUnits caughtExnAlignment = CGF.CGM.getClassPointerAlignment(catchRD);

-    llvm::Type *PtrTy = CGF.UnqualPtrTy; // addrspace 0 ok
+    llvm::Type *PtrTy = CGF.DefaultPtrTy;

     // Check for a copy expression.  If we don't have a copy expression,
     // that means a trivial copy is okay.
@@ -5248,7 +5244,7 @@ void XLCXXABI::registerGlobalDtor(CodeGenFunction &CGF, const VarDecl &D,
                                   llvm::FunctionCallee Dtor,
                                   llvm::Constant *Addr) {
   if (D.getTLSKind() != VarDecl::TLS_None) {
-    llvm::PointerType *PtrTy = CGF.UnqualPtrTy;
+    llvm::PointerType *PtrTy = CGF.DefaultPtrTy;

     // extern "C" int __pt_atexit_np(int flags, int(*)(int,...), ...);
     llvm::FunctionType *AtExitTy =
diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
index 19d9265247119..71e24491f19a4 100644
--- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -528,7 +528,7 @@ class MicrosoftCXXABI : public CGCXXABI {
         CGM.IntTy,
         CGM.IntTy,
         CGM.IntTy,
-        getImageRelativeType(CGM.UnqualPtrTy),
+        getImageRelativeType(CGM.DefaultPtrTy),
     };
     BaseClassDescriptorType = llvm::StructType::create(
         CGM.getLLVMContext(), FieldTypes, "rtti.BaseClassDescriptor");
@@ -540,7 +540,7 @@ class MicrosoftCXXABI : public CGCXXABI {
       return ClassHierarchyDescriptorType;
     // Forward-declare RTTIClassHierarchyDescriptor to break a cycle.
     llvm::Type *FieldTypes[] = {CGM.IntTy, CGM.IntTy, CGM.IntTy,
-                                getImageRelativeType(CGM.UnqualPtrTy)};
+                                getImageRelativeType(CGM.DefaultPtrTy)};
     ClassHierarchyDescriptorType =
         llvm::StructType::create(FieldTypes, "rtti.ClassHierarchyDescriptor");
     return ClassHierarchyDescriptorType;
@@ -554,7 +554,7 @@ class MicrosoftCXXABI : public CGCXXABI {
         CGM.IntTy,
         CGM.IntTy,
         getImageRelativeType(CGM.Int8PtrTy),
-        getImageRelativeType(CGM.UnqualPtrTy),
+        getImageRelativeType(CGM.DefaultPtrTy),
         getImageRelativeType(CGM.VoidTy),
     };
     llvm::ArrayRef<llvm::Type *> FieldTypesRef(FieldTypes);
@@ -752,7 +752,7 @@ class MicrosoftCXXABI : public CGCXXABI {
     llvm::SmallString<23> CTATypeName("eh.CatchableTypeArray.");
     CTATypeName += llvm::utostr(NumEntries);
-    llvm::Type *CTType = getImageRelativeType(CGM.UnqualPtrTy);
+    llvm::Type *CTType = getImageRelativeType(CGM.DefaultPtrTy);
     llvm::Type *FieldTypes[] = {
         CGM.IntTy,                               // NumEntries
         llvm::ArrayType::get(CTType, NumEntries) // CatchableTypes
@@ -779,7 +779,7 @@ class MicrosoftCXXABI : public CGCXXABI {
   llvm::FunctionCallee getThrowFn() {
     // _CxxThrowException is passed an exception object and a ThrowInfo object
     // which describes the exception.
-    llvm::Type *Args[] = {CGM.Int8PtrTy, CGM.UnqualPtrTy};
+    llvm::Type *Args[] = {CGM.Int8PtrTy, CGM.DefaultPtrTy};
     llvm::FunctionType *FTy =
         llvm::FunctionType::get(CGM.VoidTy, Args, /*isVarArg=*/false);
     llvm::FunctionCallee Throw =
@@ -920,7 +920,7 @@ void MicrosoftCXXABI::emitVirtualObjectDelete(CodeGenFunction &CGF,
 void MicrosoftCXXABI::emitRethrow(CodeGenFunction &CGF, bool isNoReturn) {
   llvm::Value *Args[] = {llvm::ConstantPointerNull::get(CGM.Int8PtrTy),
-                         llvm::ConstantPointerNull::get(CGM.UnqualPtrTy)};
+                         llvm::ConstantPointerNull::get(CGM.DefaultPtrTy)};
   llvm::FunctionCallee Fn = getThrowFn();
   if (isNoReturn)
     CGF.EmitNoreturnRuntimeCallOrInvoke(Fn, Args);
@@ -1969,13 +1969,13 @@ CGCallee MicrosoftCXXABI::getVirtualFunctionPointer(CodeGenFunction &CGF,
                                                     SourceLocation Loc) {
   CGBuilderTy &Builder = CGF.Builder;

-  Ty = CGF.UnqualPtrTy;
+  Ty = CGF.DefaultPtrTy;
   Address VPtr =
       adjustThisArgumentForVirtualFunctionCall(CGF, GD, This, true);

   auto *MethodDecl = cast<CXXMethodDecl>(GD.getDecl());
   llvm::Value *VTable =
-      CGF.GetVTablePtr(VPtr, CGF.UnqualPtrTy, MethodDecl->getParent());
+      CGF.GetVTablePtr(VPtr, CGF.DefaultPtrTy, MethodDecl->getParent());

   MicrosoftVTableContext &VFTContext = CGM.getMicrosoftVTableContext();
   MethodVFTableLocation ML = VFTContext.getMethodVFTableLocation(GD);
@@ -2136,9 +2136,9 @@ MicrosoftCXXABI::EmitVirtualMemPtrThunk(const CXXMethodDecl *MD,

   // Load the vfptr and then callee from the vftable.  The callee should have
   // adjusted 'this' so that the vfptr is at offset zero.
-  llvm::Type *ThunkPtrTy = CGF.UnqualPtrTy;
+  llvm::Type *ThunkPtrTy = CGF.DefaultPtrTy;
   llvm::Value *VTable =
-      CGF.GetVTablePtr(getThisAddress(CGF), CGF.UnqualPtrTy, MD->getParent());
+      CGF.GetVTablePtr(getThisAddress(CGF), CGF.DefaultPtrTy, MD->getParent());

   llvm::Value *VFuncPtr = CGF.Builder.CreateConstInBoundsGEP1_64(
       ThunkPtrTy, VTable, ML.Index, "vfn");
@@ -2562,7 +2562,7 @@ static ConstantAddress getInitThreadEpochPtr(CodeGenModule &CGM) {
 static llvm::FunctionCallee getInitThreadHeaderFn(CodeGenModule &CGM) {
   llvm::FunctionType *FTy =
       llvm::FunctionType::get(llvm::Type::getVoidTy(CGM.getLLVMContext()),
-                              CGM.UnqualPtrTy, /*isVarArg=*/false);
+                              CGM.DefaultPtrTy, /*isVarArg=*/false);
   return CGM.CreateRuntimeFunction(
       FTy, "_Init_thread_header",
       llvm::AttributeList::get(CGM.getLLVMContext(),
@@ -2574,7 +2574,7 @@ static llvm::FunctionCallee getInitThreadHeaderFn(CodeGenModule &CGM) {
 static llvm::FunctionCallee getInitThreadFooterFn(CodeGenModule &CGM) {
   llvm::FunctionType *FTy =
       llvm::FunctionType::get(llvm::Type::getVoidTy(CGM.getLLVMContext()),
-                              CGM.UnqualPtrTy, /*isVarArg=*/false);
+                              CGM.DefaultPtrTy, /*isVarArg=*/false);
   return CGM.CreateRuntimeFunction(
       FTy, "_Init_thread_footer",
       llvm::AttributeList::get(CGM.getLLVMContext(),
@@ -2586,7 +2586,7 @@ static llvm::FunctionCallee getInitThreadFooterFn(CodeGenModule &CGM) {
 static llvm::FunctionCallee getInitThreadAbortFn(CodeGenModule &CGM) {
   llvm::FunctionType *FTy =
       llvm::FunctionType::get(llvm::Type::getVoidTy(CGM.getLLVMContext()),
-                              CGM.UnqualPtrTy, /*isVarArg=*/false);
+                              CGM.DefaultPtrTy, /*isVarArg=*/false);
   return CGM.CreateRuntimeFunction(
       FTy, "_Init_thread_abort",
       llvm::AttributeList::get(CGM.getLLVMContext(),
@@ -3169,7 +3169,7 @@ MicrosoftCXXABI::GetVBaseOffsetFromVBPtr(CodeGenFunction &CGF,
   }

   llvm::Value *VBTable =
-      Builder.CreateAlignedLoad(CGM.UnqualPtrTy, VBPtr, VBPtrAlign, "vbtable");
+      Builder.CreateAlignedLoad(CGM.DefaultPtrTy, VBPtr, VBPtrAlign, "vbtable");

   // Translate from byte offset to table index. It improves analyzability.
   llvm::Value *VBTableIndex = Builder.CreateAShr(
@@ -3825,7 +3825,7 @@ MSRTTIBuilder::getBaseClassArray(SmallVectorImpl<MSRTTIClass> &Classes) {
   // mode) bytes of padding.  We provide a pointer sized amount of padding by
   // adding +1 to Classes.size().  The sections have pointer alignment and are
   // marked pick-any so it shouldn't matter.
-  llvm::Type *PtrType = ABI.getImageRelativeType(CGM.UnqualPtrTy);
+  llvm::Type *PtrType = ABI.getImageRelativeType(CGM.DefaultPtrTy);
   auto *ArrType = llvm::ArrayType::get(PtrType, Classes.size() + 1);
   auto *BCA =
       new llvm::GlobalVariable(Module, ArrType,
@@ -4372,7 +4372,7 @@ llvm::GlobalVariable *MicrosoftCXXABI::getCatchableTypeArray(QualType T) {
   CatchableTypes.insert(getCatchableType(getContext().VoidPtrTy));

   uint32_t NumEntries = CatchableTypes.size();
-  llvm::Type *CTType = getImageRelativeType(CGM.UnqualPtrTy);
+  llvm::Type *CTType = getImageRelativeType(CGM.DefaultPtrTy);
   llvm::ArrayType *AT = llvm::ArrayType::get(CTType, NumEntries);
   llvm::StructType *CTAType = getCatchableTypeArrayType(NumEntries);
   llvm::Constant *Fields[] = {
diff --git a/clang/lib/CodeGen/SwiftCallingConv.cpp b/clang/lib/CodeGen/SwiftCallingConv.cpp
index 4d894fd99db05..209654303a82b 100644
--- a/clang/lib/CodeGen/SwiftCallingConv.cpp
+++ b/clang/lib/CodeGen/SwiftCallingConv.cpp
@@ -66,7 +66,7 @@ void SwiftAggLowering::addTypedData(QualType type, CharUnits begin) {
   // Record types.
   if (auto recType = type->getAsCanonical<RecordType>()) {
-    addTypedData(recType->getOriginalDecl(), begin);
+    addTypedData(recType->getDecl(), begin);

   // Array types.
   } else if (type->isArrayType()) {
@@ -814,7 +814,7 @@ static ABIArgInfo classifyType(CodeGenModule &CGM, CanQualType type,
                                bool forReturn) {
   unsigned IndirectAS = CGM.getDataLayout().getAllocaAddrSpace();
   if (auto recordType = dyn_cast<RecordType>(type)) {
-    auto record = recordType->getOriginalDecl();
+    auto record = recordType->getDecl();
     auto &layout = CGM.getContext().getASTRecordLayout(record);

     if (mustPassRecordIndirectly(CGM, record))
       return ABIArgInfo::getIndirect(layout.getAlignment(),
                                      /*AddrSpace=*/IndirectAS, /*byval=*/false);

     SwiftAggLowering lowering(CGM);
-    lowering.addTypedData(recordType->getOriginalDecl(), CharUnits::Zero(),
-                          layout);
+    lowering.addTypedData(recordType->getDecl(), CharUnits::Zero(), layout);
     lowering.finish();

     return classifyExpandedType(lowering, forReturn, layout.getAlignment(),
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 84b177b83d92f..988c7ec1271dd 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -12,6 +12,7 @@
 #include "CGBuiltin.h"
 #include "CodeGenFunction.h"
+#include "clang/Basic/SyncScope.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -313,33 +314,33 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
   }

   // Older builtins had an enum argument for the memory scope.
+  const char *SSN = nullptr;
   int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
   switch (scope) {
-  case 0: // __MEMORY_SCOPE_SYSTEM
+  case AtomicScopeGenericModel::System: // __MEMORY_SCOPE_SYSTEM
     SSID = llvm::SyncScope::System;
     break;
-  case 1: // __MEMORY_SCOPE_DEVICE
-    if (getTarget().getTriple().isSPIRV())
-      SSID = getLLVMContext().getOrInsertSyncScopeID("device");
-    else
-      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
+  case AtomicScopeGenericModel::Device: // __MEMORY_SCOPE_DEVICE
+    SSN = getTarget().getTriple().isSPIRV() ? "device" : "agent";
     break;
-  case 2: // __MEMORY_SCOPE_WRKGRP
-    SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
+  case AtomicScopeGenericModel::Workgroup: // __MEMORY_SCOPE_WRKGRP
+    SSN = "workgroup";
     break;
-  case 3: // __MEMORY_SCOPE_WVFRNT
-    if (getTarget().getTriple().isSPIRV())
-      SSID = getLLVMContext().getOrInsertSyncScopeID("subgroup");
-    else
-      SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
+  case AtomicScopeGenericModel::Cluster: // __MEMORY_SCOPE_CLUSTR
+    SSN = getTarget().getTriple().isSPIRV() ? "workgroup" : "cluster";
     break;
-  case 4: // __MEMORY_SCOPE_SINGLE
+  case AtomicScopeGenericModel::Wavefront: // __MEMORY_SCOPE_WVFRNT
+    SSN = getTarget().getTriple().isSPIRV() ? "subgroup" : "wavefront";
+    break;
+  case AtomicScopeGenericModel::Single: // __MEMORY_SCOPE_SINGLE
     SSID = llvm::SyncScope::SingleThread;
     break;
   default:
     SSID = llvm::SyncScope::System;
     break;
   }
+  if (SSN)
+    SSID = getLLVMContext().getOrInsertSyncScopeID(SSN);
 }

 llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
@@ -901,6 +902,26 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()});
     return Builder.CreateCall(F, {Args});
   }
+  case AMDGPU::BI__builtin_amdgcn_global_load_b128:
+  case AMDGPU::BI__builtin_amdgcn_global_store_b128: {
+    const bool IsStore =
+        BuiltinID == AMDGPU::BI__builtin_amdgcn_global_store_b128;
+    LLVMContext &Ctx = CGM.getLLVMContext();
+    SmallVector<Value *, 3> Args = {EmitScalarExpr(E->getArg(0))}; // addr
+    if (IsStore)
+      Args.push_back(EmitScalarExpr(E->getArg(1))); // data
+    const unsigned ScopeIdx = E->getNumArgs() - 1;
+    StringRef ScopeLit =
+        cast<StringLiteral>(E->getArg(ScopeIdx)->IgnoreParenCasts())
+            ->getString();
+    llvm::MDNode *MD =
+        llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, ScopeLit)});
+    Args.push_back(llvm::MetadataAsValue::get(Ctx, MD)); // scope
+    llvm::Function *F =
+        CGM.getIntrinsic(IsStore ? Intrinsic::amdgcn_global_store_b128
+                                 : Intrinsic::amdgcn_global_load_b128);
+    return Builder.CreateCall(F, Args);
+  }
   case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
     Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
                                    {llvm::Type::getInt64Ty(getLLVMContext())});
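[Reviewer aside, not part of the patch] The rewritten ProcessOrderScopeAMDGCN above folds the repeated getOrInsertSyncScopeID calls into a name-first switch, with the new cluster scope mapping back to "workgroup" on SPIR-V. The same mapping as a freestanding function; the enum here is illustrative and is not Clang's AtomicScopeGenericModel:

#include "llvm/ADT/StringRef.h"

enum class MemScope { System, Device, Workgroup, Cluster, Wavefront, Single };

// Pick the sync-scope string once; callers intern it with
// getOrInsertSyncScopeID only when a name was chosen.
static llvm::StringRef scopeName(MemScope S, bool IsSPIRV) {
  switch (S) {
  case MemScope::Device:    return IsSPIRV ? "device" : "agent";
  case MemScope::Workgroup: return "workgroup";
  case MemScope::Cluster:   return IsSPIRV ? "workgroup" : "cluster";
  case MemScope::Wavefront: return IsSPIRV ? "subgroup" : "wavefront";
  default:                  return ""; // System/Single use LLVM's named scopes
  }
}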
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 2429a430433d7..60f9b86333670 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -2037,7 +2037,7 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
   case NEON::BI__builtin_neon_vld1q_x3_v:
   case NEON::BI__builtin_neon_vld1_x4_v:
   case NEON::BI__builtin_neon_vld1q_x4_v: {
-    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
+    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
@@ -2263,11 +2263,11 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
    // in AArch64 it comes last. We may want to stick to one or another.
    if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be ||
        Arch == llvm::Triple::aarch64_32) {
-      llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
+      llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
      std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
      return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
    }
-    llvm::Type *Tys[2] = {UnqualPtrTy, VTy};
+    llvm::Type *Tys[2] = {DefaultPtrTy, VTy};
    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
  }
  case NEON::BI__builtin_neon_vsubhn_v: {
@@ -2858,7 +2858,7 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
    Function *F = CGM.getIntrinsic(
        BuiltinID == clang::ARM::BI__builtin_arm_ldaex ? Intrinsic::arm_ldaex
                                                       : Intrinsic::arm_ldrex,
-        UnqualPtrTy);
+        DefaultPtrTy);
    CallInst *Val = Builder.CreateCall(F, LoadAddr, "ldrex");
    Val->addParamAttr(
        0, Attribute::get(getLLVMContext(), Attribute::ElementType, IntTy));
@@ -5225,7 +5225,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
        CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
                             ? Intrinsic::aarch64_ldaxr
                             : Intrinsic::aarch64_ldxr,
-                         UnqualPtrTy);
+                         DefaultPtrTy);
    CallInst *Val = Builder.CreateCall(F, LoadAddr, "ldxr");
    Val->addParamAttr(
        0, Attribute::get(getLLVMContext(), Attribute::ElementType, IntTy));
@@ -7482,42 +7482,42 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
  }
  case NEON::BI__builtin_neon_vld2_v:
  case NEON::BI__builtin_neon_vld2q_v: {
-    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
+    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_v:
  case NEON::BI__builtin_neon_vld3q_v: {
-    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
+    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_v:
  case NEON::BI__builtin_neon_vld4q_v: {
-    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
+    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld2_dup_v:
  case NEON::BI__builtin_neon_vld2q_dup_v: {
-    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
+    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld3_dup_v:
  case NEON::BI__builtin_neon_vld3q_dup_v: {
-    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
+    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
  }
  case NEON::BI__builtin_neon_vld4_dup_v:
  case NEON::BI__builtin_neon_vld4q_dup_v: {
-    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
+    llvm::Type *Tys[2] = {VTy, DefaultPtrTy};
    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4r, Tys);
    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
diff --git a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp
index e71dc9ea523a2..44d5938e38724 100644
--- a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp
@@ -59,7 +59,7 @@ static llvm::Value *emitPPCLoadReserveIntrinsic(CodeGenFunction &CGF,
     Constraints += MachineClobbers;
   }

-  llvm::Type *PtrType = CGF.UnqualPtrTy;
+  llvm::Type *PtrType = CGF.DefaultPtrTy;
   llvm::FunctionType *FTy = llvm::FunctionType::get(RetType, {PtrType}, false);

   llvm::InlineAsm *IA =
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
index 1e58c3f217812..342a3af0ac1ee 100644
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -82,6 +82,8 @@ TargetCodeGenInfo::~TargetCodeGenInfo() = default;
 // If someone can figure out a general rule for this, that would be great.
 // It's probably just doomed to be platform-dependent, though.
 unsigned TargetCodeGenInfo::getSizeOfUnwindException() const {
+  if (getABIInfo().getCodeGenOpts().hasSEHExceptions())
+    return getABIInfo().getDataLayout().getPointerSizeInBits() > 32 ? 64 : 48;
   // Verified for:
   //   x86-64     FreeBSD, Linux, Darwin
   //   x86-32     FreeBSD, Linux, Darwin
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index d7deece232a9f..bb41a14f5d2f3 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -743,7 +743,7 @@ bool AArch64ABIInfo::passAsPureScalableType(
     return false;

   // Pure scalable types are never unions and never contain unions.
-  const RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf();
+  const RecordDecl *RD = RT->getDecl()->getDefinitionOrSelf();
   if (RD->isUnion())
     return false;
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 0fcbf7e458a34..0bc4b4b7025f2 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -402,6 +402,26 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
   }
+
+  if (auto *Attr = FD->getAttr<AMDGPUClusterDimsAttr>()) {
+    auto GetExprVal = [&](const auto &E) {
+      return E ? E->EvaluateKnownConstInt(M.getContext()).getExtValue() : 1;
+    };
+    unsigned X = GetExprVal(Attr->getX());
+    unsigned Y = GetExprVal(Attr->getY());
+    unsigned Z = GetExprVal(Attr->getZ());
+    llvm::SmallString<32> AttrVal;
+    llvm::raw_svector_ostream OS(AttrVal);
+    OS << X << ',' << Y << ',' << Z;
+    F->addFnAttr("amdgpu-cluster-dims", AttrVal.str());
+  }
+
+  // OpenCL doesn't support cluster feature.
+  const TargetInfo &TTI = M.getContext().getTargetInfo();
+  if ((IsOpenCLKernel &&
+       TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters")) ||
+      FD->hasAttr())
+    F->addFnAttr("amdgpu-cluster-dims", "0,0,0");
 }

 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
@@ -488,6 +508,10 @@ AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
   case SyncScope::WavefrontScope:
     Name = "wavefront";
     break;
+  case SyncScope::HIPCluster:
+  case SyncScope::ClusterScope:
+    Name = "cluster";
+    break;
   case SyncScope::HIPWorkgroup:
   case SyncScope::OpenCLWorkGroup:
   case SyncScope::WorkgroupScope:
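[Reviewer aside, not part of the patch] The "amdgpu-cluster-dims" attribute added above is a plain "x,y,z" string: unspecified dimensions default to 1, and "0,0,0" is the explicit no-cluster marker. The composition step in isolation:

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/raw_ostream.h"

// Compose the attribute value exactly as the hunk above does.
static llvm::SmallString<32> clusterDimsAttr(unsigned X, unsigned Y,
                                             unsigned Z) {
  llvm::SmallString<32> Val;
  llvm::raw_svector_ostream OS(Val);
  OS << X << ',' << Y << ',' << Z;
  return Val;
}

A caller would then do F->addFnAttr("amdgpu-cluster-dims", clusterDimsAttr(X, Y, Z)), mirroring the patch.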
- if (RT && - RT->getOriginalDecl()->getDefinitionOrSelf()->hasFlexibleArrayMember()) + if (RT && RT->getDecl()->getDefinitionOrSelf()->hasFlexibleArrayMember()) return getIndirectByValue(Ty); // Ignore empty structs/unions. diff --git a/clang/lib/CodeGen/Targets/ARM.cpp b/clang/lib/CodeGen/Targets/ARM.cpp index c84c9f2f643ee..4d05217cafb79 100644 --- a/clang/lib/CodeGen/Targets/ARM.cpp +++ b/clang/lib/CodeGen/Targets/ARM.cpp @@ -516,7 +516,7 @@ static bool isIntegerLikeType(QualType Ty, ASTContext &Context, if (!RT) return false; // Ignore records with flexible arrays. - const RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *RD = RT->getDecl()->getDefinitionOrSelf(); if (RD->hasFlexibleArrayMember()) return false; diff --git a/clang/lib/CodeGen/Targets/Lanai.cpp b/clang/lib/CodeGen/Targets/Lanai.cpp index e76431a484e70..871a13513e374 100644 --- a/clang/lib/CodeGen/Targets/Lanai.cpp +++ b/clang/lib/CodeGen/Targets/Lanai.cpp @@ -102,8 +102,7 @@ ABIArgInfo LanaiABIInfo::classifyArgumentType(QualType Ty, if (isAggregateTypeForABI(Ty)) { // Structures with flexible arrays are always indirect. - if (RT && - RT->getOriginalDecl()->getDefinitionOrSelf()->hasFlexibleArrayMember()) + if (RT && RT->getDecl()->getDefinitionOrSelf()->hasFlexibleArrayMember()) return getIndirectResult(Ty, /*ByVal=*/true, State); // Ignore empty structs/unions. diff --git a/clang/lib/CodeGen/Targets/LoongArch.cpp b/clang/lib/CodeGen/Targets/LoongArch.cpp index 1f344d6582510..878723d67f081 100644 --- a/clang/lib/CodeGen/Targets/LoongArch.cpp +++ b/clang/lib/CodeGen/Targets/LoongArch.cpp @@ -150,7 +150,7 @@ bool LoongArchABIInfo::detectFARsEligibleStructHelper( // Non-zero-length arrays of empty records make the struct ineligible to be // passed via FARs in C++. if (const auto *RTy = EltTy->getAsCanonical()) { - if (ArraySize != 0 && isa(RTy->getOriginalDecl()) && + if (ArraySize != 0 && isa(RTy->getDecl()) && isEmptyRecord(getContext(), EltTy, true, true)) return false; } @@ -169,7 +169,7 @@ bool LoongArchABIInfo::detectFARsEligibleStructHelper( // copy constructor are not eligible for the FP calling convention. 
if (getRecordArgABI(Ty, CGT.getCXXABI())) return false; - const RecordDecl *RD = RTy->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *RD = RTy->getDecl()->getDefinitionOrSelf(); if (isEmptyRecord(getContext(), Ty, true, true) && (!RD->isUnion() || !isa(RD))) return true; diff --git a/clang/lib/CodeGen/Targets/Mips.cpp b/clang/lib/CodeGen/Targets/Mips.cpp index f26ab974d699e..22fdcd95ea8fa 100644 --- a/clang/lib/CodeGen/Targets/Mips.cpp +++ b/clang/lib/CodeGen/Targets/Mips.cpp @@ -161,7 +161,7 @@ llvm::Type* MipsABIInfo::HandleAggregates(QualType Ty, uint64_t TySize) const { return llvm::StructType::get(getVMContext(), ArgList); } - const RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *RD = RT->getDecl()->getDefinitionOrSelf(); const ASTRecordLayout &Layout = getContext().getASTRecordLayout(RD); assert(!(TySize % 8) && "Size of structure must be multiple of 8."); @@ -265,7 +265,7 @@ MipsABIInfo::returnAggregateInRegs(QualType RetTy, uint64_t Size) const { SmallVector RTList; if (RT && RT->isStructureOrClassType()) { - const RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *RD = RT->getDecl()->getDefinitionOrSelf(); const ASTRecordLayout &Layout = getContext().getASTRecordLayout(RD); unsigned FieldCnt = Layout.getFieldCount(); diff --git a/clang/lib/CodeGen/Targets/PPC.cpp b/clang/lib/CodeGen/Targets/PPC.cpp index 380e8c06c46f0..35e7655646ade 100644 --- a/clang/lib/CodeGen/Targets/PPC.cpp +++ b/clang/lib/CodeGen/Targets/PPC.cpp @@ -490,7 +490,7 @@ RValue PPC32_SVR4_ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAList, llvm::Type *DirectTy = CGF.ConvertType(Ty), *ElementTy = DirectTy; if (isIndirect) - DirectTy = CGF.UnqualPtrTy; + DirectTy = CGF.DefaultPtrTy; // Case 1: consume registers. Address RegAddr = Address::invalid(); diff --git a/clang/lib/CodeGen/Targets/RISCV.cpp b/clang/lib/CodeGen/Targets/RISCV.cpp index 0ef39b68eb6e3..d1345891e9fb6 100644 --- a/clang/lib/CodeGen/Targets/RISCV.cpp +++ b/clang/lib/CodeGen/Targets/RISCV.cpp @@ -234,7 +234,7 @@ bool RISCVABIInfo::detectFPCCEligibleStructHelper(QualType Ty, CharUnits CurOff, // Non-zero-length arrays of empty records make the struct ineligible for // the FP calling convention in C++. if (const auto *RTy = EltTy->getAsCanonical()) { - if (ArraySize != 0 && isa(RTy->getOriginalDecl()) && + if (ArraySize != 0 && isa(RTy->getDecl()) && isEmptyRecord(getContext(), EltTy, true, true)) return false; } @@ -256,7 +256,7 @@ bool RISCVABIInfo::detectFPCCEligibleStructHelper(QualType Ty, CharUnits CurOff, return false; if (isEmptyRecord(getContext(), Ty, true, true)) return true; - const RecordDecl *RD = RTy->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *RD = RTy->getDecl()->getDefinitionOrSelf(); // Unions aren't eligible unless they're empty (which is caught above). 
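The LoongArch and RISC-V helpers in this region share a C++ subtlety their comments call out: an empty record still occupies one byte in C++, so a non-zero-length array of empty records disqualifies a struct from the FP calling convention. A minimal example of a struct these helpers reject:

    struct Empty {};
    struct Mixed {
      Empty Pad[4]; // non-zero-length array of empty records (C++ only)
      double D;     // would otherwise be eligible for an FP register
    };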
if (RD->isUnion()) return false; @@ -680,22 +680,22 @@ ABIArgInfo RISCVABIInfo::classifyArgumentType(QualType Ty, bool IsFixed, if (const auto *ED = Ty->getAsEnumDecl()) Ty = ED->getIntegerType(); - // All integral types are promoted to XLen width - if (Size < XLen && Ty->isIntegralOrEnumerationType()) { - return extendType(Ty, CGT.ConvertType(Ty)); - } - if (const auto *EIT = Ty->getAs<BitIntType>()) { - if (EIT->getNumBits() < XLen) + + if (XLen == 64 && EIT->getNumBits() == 32) return extendType(Ty, CGT.ConvertType(Ty)); - if (EIT->getNumBits() > 128 || - (!getContext().getTargetInfo().hasInt128Type() && - EIT->getNumBits() > 64)) - return getNaturalAlignIndirect( - Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(), - /*ByVal=*/false); + + if (EIT->getNumBits() <= 2 * XLen) + return ABIArgInfo::getExtend(Ty, CGT.ConvertType(Ty)); + return getNaturalAlignIndirect( + Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(), + /*ByVal=*/false); } + // All integral types are promoted to XLen width + if (Size < XLen && Ty->isIntegralOrEnumerationType()) + return extendType(Ty, CGT.ConvertType(Ty)); + return ABIArgInfo::getDirect(); } diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index 3f6d4e0a9277a..80e096ecf5ae9 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -93,6 +93,8 @@ inline StringRef mapClangSyncScopeToLLVM(SyncScope Scope) { case SyncScope::OpenCLSubGroup: case SyncScope::WavefrontScope: return "subgroup"; + case SyncScope::HIPCluster: + case SyncScope::ClusterScope: case SyncScope::HIPWorkgroup: case SyncScope::OpenCLWorkGroup: case SyncScope::WorkgroupScope: diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp index fb789489664df..8daf8eb1d39f1 100644 --- a/clang/lib/CodeGen/Targets/X86.cpp +++ b/clang/lib/CodeGen/Targets/X86.cpp @@ -795,8 +795,7 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, CCState &State, if (isAggregateTypeForABI(Ty)) { // Structures with flexible arrays are always indirect. // FIXME: This should not be byval! - if (RT && - RT->getOriginalDecl()->getDefinitionOrSelf()->hasFlexibleArrayMember()) + if (RT && RT->getDecl()->getDefinitionOrSelf()->hasFlexibleArrayMember()) return getIndirectResult(Ty, true, State); // Ignore empty structs/unions on non-Windows. @@ -831,7 +830,7 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, CCState &State, unsigned AlignInBits = 0; if (RT) { const ASTRecordLayout &Layout = - getContext().getASTRecordLayout(RT->getOriginalDecl()); + getContext().getASTRecordLayout(RT->getDecl()); AlignInBits = getContext().toBits(Layout.getRequiredAlignment()); } else if (TI.isAlignRequired()) { AlignInBits = TI.Align; @@ -2042,7 +2041,7 @@ void X86_64ABIInfo::classify(QualType Ty, uint64_t OffsetBase, Class &Lo, if (getRecordArgABI(RT, getCXXABI())) return; - const RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *RD = RT->getDecl()->getDefinitionOrSelf(); // Assume variable sized types are passed in memory.
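The RISC-V hunk above moves the bit-precise integer cases ahead of the generic promotion to XLen. A standalone model of the new branch order, assuming XLen is 32 or 64:

    enum class RVClass { ExtendToXLen, ExtendInRegs, Indirect };

    // Sketch of the new classification for a _BitInt(NumBits) argument.
    static RVClass classifyBitInt(unsigned NumBits, unsigned XLen) {
      if (XLen == 64 && NumBits == 32)
        return RVClass::ExtendToXLen; // _BitInt(32) is extended on RV64
      if (NumBits <= 2 * XLen)
        return RVClass::ExtendInRegs; // fits in at most two GPRs
      return RVClass::Indirect;       // wider values are passed by reference
    }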
if (RD->hasFlexibleArrayMember()) @@ -2851,9 +2850,8 @@ ABIArgInfo X86_64ABIInfo::classifyRegCallStructTypeImpl(QualType Ty, unsigned &NeededInt, unsigned &NeededSSE, unsigned &MaxVectorWidth) const { - auto *RD = cast<RecordType>(Ty.getCanonicalType()) - ->getOriginalDecl() - ->getDefinitionOrSelf(); + auto *RD = + cast<RecordType>(Ty.getCanonicalType())->getDecl()->getDefinitionOrSelf(); if (RD->hasFlexibleArrayMember()) return getIndirectReturnResult(Ty); @@ -3313,7 +3311,7 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, RAA == CGCXXABI::RAA_DirectInMemory); } - if (RT->getOriginalDecl()->getDefinitionOrSelf()->hasFlexibleArrayMember()) + if (RT->getDecl()->getDefinitionOrSelf()->hasFlexibleArrayMember()) return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(), /*ByVal=*/false); } diff --git a/clang/lib/CodeGen/Targets/XCore.cpp b/clang/lib/CodeGen/Targets/XCore.cpp index ab0115467e521..f9726ec0a661d 100644 --- a/clang/lib/CodeGen/Targets/XCore.cpp +++ b/clang/lib/CodeGen/Targets/XCore.cpp @@ -380,7 +380,7 @@ static bool appendRecordType(SmallStringEnc &Enc, const RecordType *RT, // We collect all encoded fields and order as necessary. bool IsRecursive = false; - const RecordDecl *RD = RT->getOriginalDecl()->getDefinition(); + const RecordDecl *RD = RT->getDecl()->getDefinition(); if (RD && !RD->field_empty()) { // An incomplete TypeString stub is placed in the cache for this RecordType // so that recursive calls to this RecordType will use it whilst building a @@ -429,7 +429,7 @@ static bool appendEnumType(SmallStringEnc &Enc, const EnumType *ET, Enc += "){"; // We collect all encoded enumerations and order them alphanumerically. - if (const EnumDecl *ED = ET->getOriginalDecl()->getDefinition()) { + if (const EnumDecl *ED = ET->getDecl()->getDefinition()) { SmallVector FE; for (auto I = ED->enumerator_begin(), E = ED->enumerator_end(); I != E; ++I) { diff --git a/clang/lib/Driver/Distro.cpp b/clang/lib/Driver/Distro.cpp index 838e087475ccf..df10458d092d6 100644 --- a/clang/lib/Driver/Distro.cpp +++ b/clang/lib/Driver/Distro.cpp @@ -61,10 +61,6 @@ static Distro::DistroType DetectLsbRelease(llvm::vfs::FileSystem &VFS) { if (Version == Distro::UnknownDistro && Line.starts_with("DISTRIB_CODENAME=")) Version = llvm::StringSwitch<Distro::DistroType>(Line.substr(17)) - .Case("maverick", Distro::UbuntuMaverick) - .Case("natty", Distro::UbuntuNatty) - .Case("oneiric", Distro::UbuntuOneiric) - .Case("precise", Distro::UbuntuPrecise) .Case("quantal", Distro::UbuntuQuantal) .Case("raring", Distro::UbuntuRaring) .Case("saucy", Distro::UbuntuSaucy) @@ -120,13 +116,17 @@ static Distro::DistroType DetectDistro(llvm::vfs::FileSystem &VFS) { if (Data.starts_with("Fedora release")) return Distro::Fedora; if (Data.starts_with("Red Hat Enterprise Linux") || - Data.starts_with("CentOS") || Data.starts_with("Scientific Linux")) { + Data.starts_with("CentOS") || Data.starts_with("AlmaLinux") || + Data.starts_with("Rocky Linux") || + Data.starts_with("Scientific Linux")) { + if (Data.contains("release 10")) + return Distro::RHEL10; + if (Data.contains("release 9")) + return Distro::RHEL9; + if (Data.contains("release 8")) + return Distro::RHEL8; if (Data.contains("release 7")) return Distro::RHEL7; - else if (Data.contains("release 6")) - return Distro::RHEL6; - else if (Data.contains("release 5")) - return Distro::RHEL5; } return Distro::UnknownDistro; } @@ -139,12 +139,6 @@ static Distro::DistroType DetectDistro(llvm::vfs::FileSystem &VFS) { int MajorVersion; if (!Data.split('.').first.getAsInteger(10,
MajorVersion)) { switch (MajorVersion) { - case 5: - return Distro::DebianLenny; - case 6: - return Distro::DebianSqueeze; - case 7: - return Distro::DebianWheezy; case 8: return Distro::DebianJessie; case 9: @@ -166,8 +160,6 @@ static Distro::DistroType DetectDistro(llvm::vfs::FileSystem &VFS) { } } return llvm::StringSwitch<Distro::DistroType>(Data.split("\n").first) - .Case("squeeze/sid", Distro::DebianSqueeze) - .Case("wheezy/sid", Distro::DebianWheezy) .Case("jessie/sid", Distro::DebianJessie) .Case("stretch/sid", Distro::DebianStretch) .Case("buster/sid", Distro::DebianBuster) diff --git a/clang/lib/Driver/Job.cpp b/clang/lib/Driver/Job.cpp index ac983f177dc57..11192ec68b372 100644 --- a/clang/lib/Driver/Job.cpp +++ b/clang/lib/Driver/Job.cpp @@ -60,24 +60,25 @@ static bool skipArgs(const char *Flag, bool HaveCrashVFS, int &SkipNum, SkipNum = 2; // These flags are all of the form -Flag and are treated as two // arguments. Therefore, we need to skip the flag and the next argument. - bool ShouldSkip = llvm::StringSwitch<bool>(Flag) - .Cases("-MF", "-MT", "-MQ", "-serialize-diagnostic-file", true) - .Cases("-o", "-dependency-file", true) - .Cases("-fdebug-compilation-dir", "-diagnostic-log-file", true) - .Cases("-dwarf-debug-flags", "-ivfsoverlay", true) - .Default(false); + bool ShouldSkip = + llvm::StringSwitch<bool>(Flag) + .Cases({"-MF", "-MT", "-MQ", "-serialize-diagnostic-file"}, true) + .Cases({"-o", "-dependency-file"}, true) + .Cases({"-fdebug-compilation-dir", "-diagnostic-log-file"}, true) + .Cases({"-dwarf-debug-flags", "-ivfsoverlay"}, true) + .Default(false); if (ShouldSkip) return true; // Some include flags shouldn't be skipped if we have a crash VFS IsInclude = llvm::StringSwitch<bool>(Flag) - .Cases("-include", "-header-include-file", true) - .Cases("-idirafter", "-internal-isystem", "-iwithprefix", true) - .Cases("-internal-externc-isystem", "-iprefix", true) - .Cases("-iwithprefixbefore", "-isystem", "-iquote", true) - .Cases("-isysroot", "-I", "-F", "-resource-dir", true) - .Cases("-internal-iframework", "-iframework", "-include-pch", true) + .Cases({"-include", "-header-include-file"}, true) + .Cases({"-idirafter", "-internal-isystem", "-iwithprefix"}, true) + .Cases({"-internal-externc-isystem", "-iprefix"}, true) + .Cases({"-iwithprefixbefore", "-isystem", "-iquote"}, true) + .Cases({"-isysroot", "-I", "-F", "-resource-dir"}, true) + .Cases({"-internal-iframework", "-iframework", "-include-pch"}, true) .Default(false); if (IsInclude) return !HaveCrashVFS; @@ -86,9 +87,9 @@ static bool skipArgs(const char *Flag, bool HaveCrashVFS, int &SkipNum, // These flags are all of the form -Flag and have no second argument. ShouldSkip = llvm::StringSwitch<bool>(Flag) - .Cases("-M", "-MM", "-MG", "-MP", "-MD", true) - .Case("-MMD", true) - .Default(false); + .Cases({"-M", "-MM", "-MG", "-MP", "-MD"}, true) + .Case("-MMD", true) + .Default(false); // Match found. SkipNum = 1; diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp index 954ecabe86836..61beb0455147d 100644 --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp @@ -290,6 +290,8 @@ void arm::setArchNameInTriple(const Driver &D, const ArgList &Args, // Thumb2 is the default for V7 on Darwin. (llvm::ARM::parseArchVersion(Suffix) == 7 && Triple.isOSBinFormatMachO()) || + // Thumb2 is the default for Fuchsia.
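The Job.cpp hunk above, like the Mips, Darwin, and XRay hunks later in this patch, migrates to the StringSwitch::Cases overload that takes a braced initializer list instead of a flat run of variadic string arguments. Usage stays a one-liner:

    #include "llvm/ADT/StringSwitch.h"

    // The braced-list Cases() groups every alternative for one result value.
    static bool isMakeDepFlag(llvm::StringRef Flag) {
      return llvm::StringSwitch<bool>(Flag)
          .Cases({"-M", "-MM", "-MG", "-MP", "-MD"}, true)
          .Case("-MMD", true)
          .Default(false);
    }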
+ Triple.isOSFuchsia() || // FIXME: this is invalid for WindowsCE Triple.isOSWindows(); @@ -452,6 +454,9 @@ arm::FloatABI arm::getDefaultFloatABI(const llvm::Triple &Triple) { case llvm::Triple::OpenBSD: return FloatABI::SoftFP; + case llvm::Triple::Fuchsia: + return FloatABI::Hard; + default: if (Triple.isOHOSFamily()) return FloatABI::Soft; diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp index bac8681921877..227c6a0d3d202 100644 --- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp @@ -482,9 +482,9 @@ bool mips::isFPXXDefault(const llvm::Triple &Triple, StringRef CPUName, return false; return llvm::StringSwitch(CPUName) - .Cases("mips2", "mips3", "mips4", "mips5", true) - .Cases("mips32", "mips32r2", "mips32r3", "mips32r5", true) - .Cases("mips64", "mips64r2", "mips64r3", "mips64r5", true) + .Cases({"mips2", "mips3", "mips4", "mips5"}, true) + .Cases({"mips32", "mips32r2", "mips32r3", "mips32r5"}, true) + .Cases({"mips64", "mips64r2", "mips64r3", "mips64r5"}, true) .Default(false); } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 82f31ba8001fd..efc9318498cea 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3799,7 +3799,8 @@ static void RenderHLSLOptions(const ArgList &Args, ArgStringList &CmdArgs, options::OPT_hlsl_entrypoint, options::OPT_fdx_rootsignature_define, options::OPT_fdx_rootsignature_version, - options::OPT_fhlsl_spv_use_unknown_image_format}; + options::OPT_fhlsl_spv_use_unknown_image_format, + options::OPT_fhlsl_spv_enable_maximal_reconvergence}; if (!types::isHLSL(InputType)) return; for (const auto &Arg : ForwardedArguments) @@ -9435,8 +9436,9 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_nogpulibc)) { forAllAssociatedToolChains(C, JA, getToolChain(), [&](const ToolChain &TC) { // The device C library is only available for NVPTX and AMDGPU targets - // currently. - if (!TC.getTriple().isNVPTX() && !TC.getTriple().isAMDGPU()) + // and we only link it by default for OpenMP currently. + if ((!TC.getTriple().isNVPTX() && !TC.getTriple().isAMDGPU()) || + !JA.isHostOffloading(Action::OFK_OpenMP)) return; bool HasLibC = TC.getStdlibIncludePath().has_value(); if (HasLibC) { diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index d2356ebdfa86c..cc5bcd1816c52 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -51,15 +51,15 @@ llvm::Triple::ArchType darwin::getArchTypeForMachOArchName(StringRef Str) { // translation. return llvm::StringSwitch(Str) - .Cases("i386", "i486", "i486SX", "i586", "i686", llvm::Triple::x86) - .Cases("pentium", "pentpro", "pentIIm3", "pentIIm5", "pentium4", + .Cases({"i386", "i486", "i486SX", "i586", "i686"}, llvm::Triple::x86) + .Cases({"pentium", "pentpro", "pentIIm3", "pentIIm5", "pentium4"}, llvm::Triple::x86) - .Cases("x86_64", "x86_64h", llvm::Triple::x86_64) + .Cases({"x86_64", "x86_64h"}, llvm::Triple::x86_64) // This is derived from the driver. 
- .Cases("arm", "armv4t", "armv5", "armv6", "armv6m", llvm::Triple::arm) - .Cases("armv7", "armv7em", "armv7k", "armv7m", llvm::Triple::arm) - .Cases("armv7s", "xscale", llvm::Triple::arm) - .Cases("arm64", "arm64e", llvm::Triple::aarch64) + .Cases({"arm", "armv4t", "armv5", "armv6", "armv6m"}, llvm::Triple::arm) + .Cases({"armv7", "armv7em", "armv7k", "armv7m"}, llvm::Triple::arm) + .Cases({"armv7s", "xscale"}, llvm::Triple::arm) + .Cases({"arm64", "arm64e"}, llvm::Triple::aarch64) .Case("arm64_32", llvm::Triple::aarch64_32) .Case("r600", llvm::Triple::r600) .Case("amdgcn", llvm::Triple::amdgcn) diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 409140440c58a..8abc4ccaa2e17 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -1057,6 +1057,9 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_openmp_simd); } + if (Args.hasArg(options::OPT_famd_allow_threadprivate_equivalence)) + CmdArgs.push_back("-famd-allow-threadprivate-equivalence"); + // Pass the path to compiler resource files. CmdArgs.push_back("-resource-dir"); CmdArgs.push_back(D.ResourceDir.c_str()); diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp index 5c8891fbfb98a..20a320ea233d4 100644 --- a/clang/lib/Driver/ToolChains/HLSL.cpp +++ b/clang/lib/Driver/ToolChains/HLSL.cpp @@ -191,23 +191,35 @@ void getSpirvExtOperand(StringRef SpvExtensionArg, raw_ostream &out) { // The extensions that are commented out are supported in DXC, but the SPIR-V // backend does not know about them yet. static const std::vector DxcSupportedExtensions = { - "SPV_KHR_16bit_storage", "SPV_KHR_device_group", - "SPV_KHR_fragment_shading_rate", "SPV_KHR_multiview", - "SPV_KHR_post_depth_coverage", "SPV_KHR_non_semantic_info", - "SPV_KHR_shader_draw_parameters", "SPV_KHR_ray_tracing", - "SPV_KHR_shader_clock", "SPV_EXT_demote_to_helper_invocation", - "SPV_EXT_descriptor_indexing", "SPV_EXT_fragment_fully_covered", + "SPV_KHR_16bit_storage", + "SPV_KHR_device_group", + "SPV_KHR_fragment_shading_rate", + "SPV_KHR_multiview", + "SPV_KHR_post_depth_coverage", + "SPV_KHR_non_semantic_info", + "SPV_KHR_shader_draw_parameters", + "SPV_KHR_ray_tracing", + "SPV_KHR_shader_clock", + "SPV_EXT_demote_to_helper_invocation", + "SPV_EXT_descriptor_indexing", + "SPV_EXT_fragment_fully_covered", "SPV_EXT_fragment_invocation_density", - "SPV_EXT_fragment_shader_interlock", "SPV_EXT_mesh_shader", - "SPV_EXT_shader_stencil_export", "SPV_EXT_shader_viewport_index_layer", + "SPV_EXT_fragment_shader_interlock", + "SPV_EXT_mesh_shader", + "SPV_EXT_shader_stencil_export", + "SPV_EXT_shader_viewport_index_layer", // "SPV_AMD_shader_early_and_late_fragment_tests", - "SPV_GOOGLE_hlsl_functionality1", "SPV_GOOGLE_user_type", - "SPV_KHR_ray_query", "SPV_EXT_shader_image_int64", - "SPV_KHR_fragment_shader_barycentric", "SPV_KHR_physical_storage_buffer", + "SPV_GOOGLE_hlsl_functionality1", + "SPV_GOOGLE_user_type", + "SPV_KHR_ray_query", + "SPV_EXT_shader_image_int64", + "SPV_KHR_fragment_shader_barycentric", + "SPV_KHR_physical_storage_buffer", "SPV_KHR_vulkan_memory_model", // "SPV_KHR_compute_shader_derivatives", - // "SPV_KHR_maximal_reconvergence", - "SPV_KHR_float_controls", "SPV_NV_shader_subgroup_partitioned", + "SPV_KHR_maximal_reconvergence", + "SPV_KHR_float_controls", + "SPV_NV_shader_subgroup_partitioned", // "SPV_KHR_quad_control" }; diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index 
ceed7cb6acbbf..0325296f84b19 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -105,8 +105,9 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { for (const auto &P : BundleParts) { // TODO: Automate the generation of the string case table. auto Valid = llvm::StringSwitch<bool>(P) - .Cases("none", "all", "function", "function-entry", - "function-exit", "custom", true) + .Cases({"none", "all", "function", "function-entry", + "function-exit", "custom"}, + true) .Default(false); if (!Valid) { diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index 541af6d587174..e5eda46df8056 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -422,7 +422,7 @@ DeclarationFragments DeclarationFragmentsBuilder::getFragmentsForType( Fragments.append(getFragmentsForNNS(TagTy->getQualifier(), Context, After)); - const TagDecl *Decl = TagTy->getOriginalDecl(); + const TagDecl *Decl = TagTy->getDecl(); // Anonymous decl, skip this fragment. if (Decl->getName().empty()) return Fragments.append("{ ... }", diff --git a/clang/lib/ExtractAPI/TypedefUnderlyingTypeResolver.cpp b/clang/lib/ExtractAPI/TypedefUnderlyingTypeResolver.cpp index 5adbbc6d1c34c..41e4e0cf1795f 100644 --- a/clang/lib/ExtractAPI/TypedefUnderlyingTypeResolver.cpp +++ b/clang/lib/ExtractAPI/TypedefUnderlyingTypeResolver.cpp @@ -26,7 +26,7 @@ TypedefUnderlyingTypeResolver::getUnderlyingTypeDecl(QualType Type) const { if (TypedefTy) TypeDecl = TypedefTy->getDecl(); if (const TagType *TagTy = Type->getAs<TagType>()) { - TypeDecl = TagTy->getOriginalDecl(); + TypeDecl = TagTy->getDecl(); } else if (const ObjCInterfaceType *ObjCITy = Type->getAs<ObjCInterfaceType>()) { TypeDecl = ObjCITy->getDecl(); diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp index 29db20067c361..994a427517ffc 100644 --- a/clang/lib/Format/BreakableToken.cpp +++ b/clang/lib/Format/BreakableToken.cpp @@ -306,8 +306,10 @@ BreakableStringLiteralUsingOperators::BreakableStringLiteralUsingOperators( // In Verilog, all strings are quoted by double quotes, joined by commas, // and wrapped in braces. The comma is always before the newline. assert(QuoteStyle == DoubleQuotes); - LeftBraceQuote = Style.Cpp11BracedListStyle ? "{\"" : "{ \""; - RightBraceQuote = Style.Cpp11BracedListStyle ? "\"}" : "\" }"; + LeftBraceQuote = Style.Cpp11BracedListStyle != FormatStyle::BLS_Block ? "{\"" : "{ \""; + RightBraceQuote = Style.Cpp11BracedListStyle != FormatStyle::BLS_Block ?
"\"}" : "\" }"; Postfix = "\","; Prefix = "\""; } else { diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index cd4c1aabac971..37c10c66503bb 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -411,7 +411,7 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { } if (CurrentState.BreakBeforeClosingBrace && (Current.closesBlockOrBlockTypeList(Style) || - (Current.is(tok::r_brace) && + (Current.is(tok::r_brace) && Current.MatchingParen && Current.isBlockIndentedInitRBrace(Style)))) { return true; } @@ -433,7 +433,7 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { } if ((startsNextParameter(Current, Style) || Previous.is(tok::semi) || (Previous.is(TT_TemplateCloser) && Current.is(TT_StartOfName) && - State.Line->First->isNot(TT_AttributeSquare) && Style.isCpp() && + State.Line->First->isNot(TT_AttributeLSquare) && Style.isCpp() && // FIXME: This is a temporary workaround for the case where clang-format // sets BreakBeforeParameter to avoid bin packing and this creates a // completely unnecessary line break after a template type that isn't @@ -833,7 +833,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, auto IsOpeningBracket = [&](const FormatToken &Tok) { auto IsStartOfBracedList = [&]() { return Tok.is(tok::l_brace) && Tok.isNot(BK_Block) && - Style.Cpp11BracedListStyle; + Style.Cpp11BracedListStyle != FormatStyle::BLS_Block; }; if (Tok.isNoneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) && !IsStartOfBracedList()) { @@ -925,7 +925,12 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, TT_TableGenDAGArgOpenerToBreak) && !(Current.MacroParent && Previous.MacroParent) && (Current.isNot(TT_LineComment) || - Previous.isOneOf(BK_BracedInit, TT_VerilogMultiLineListLParen)) && + (Previous.is(BK_BracedInit) && + (Style.Cpp11BracedListStyle != FormatStyle::BLS_FunctionCall || + !Previous.Previous || + Previous.Previous->isNoneOf(tok::identifier, tok::l_paren, + BK_BracedInit))) || + Previous.is(TT_VerilogMultiLineListLParen)) && !IsInTemplateString(Current)) { CurrentState.Indent = State.Column + Spaces; CurrentState.IsAligned = true; @@ -1369,7 +1374,8 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { } if (Current.is(TT_LambdaArrow) && Previous.isOneOf(tok::kw_noexcept, tok::kw_mutable, tok::kw_constexpr, - tok::kw_consteval, tok::kw_static, TT_AttributeSquare)) { + tok::kw_consteval, tok::kw_static, + TT_AttributeRSquare)) { return ContinuationIndent; } if ((Current.isOneOf(tok::r_brace, tok::r_square) || @@ -1494,9 +1500,10 @@ unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { Current.isNot(tok::l_paren) && !Current.endsSequence(TT_StartOfName, TT_AttributeMacro, TT_PointerOrReference)) || - PreviousNonComment->isOneOf( - TT_AttributeRParen, TT_AttributeSquare, TT_FunctionAnnotationRParen, - TT_JavaAnnotation, TT_LeadingJavaAnnotation))) || + PreviousNonComment->isOneOf(TT_AttributeRParen, TT_AttributeRSquare, + TT_FunctionAnnotationRParen, + TT_JavaAnnotation, + TT_LeadingJavaAnnotation))) || (!Style.IndentWrappedFunctionNames && NextNonComment->isOneOf(tok::kw_operator, TT_FunctionDeclarationName))) { return std::max(CurrentState.LastSpace, CurrentState.Indent); diff --git a/clang/lib/Format/DefinitionBlockSeparator.cpp b/clang/lib/Format/DefinitionBlockSeparator.cpp index 3f4ce5fa3a428..855f2efad1e53 100644 --- 
a/clang/lib/Format/DefinitionBlockSeparator.cpp +++ b/clang/lib/Format/DefinitionBlockSeparator.cpp @@ -169,7 +169,7 @@ void DefinitionBlockSeparator::separateBlocks( } } - if (Style.isCSharp() && OperateLine->First->is(TT_AttributeSquare)) + if (Style.isCSharp() && OperateLine->First->is(TT_AttributeLSquare)) return true; return false; }; diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 686e54128d372..edd126c7724b8 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -304,6 +304,18 @@ struct ScalarEnumerationTraits { } }; +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &IO, FormatStyle::BracedListStyle &Value) { + IO.enumCase(Value, "Block", FormatStyle::BLS_Block); + IO.enumCase(Value, "FunctionCall", FormatStyle::BLS_FunctionCall); + IO.enumCase(Value, "AlignFirstComment", FormatStyle::BLS_AlignFirstComment); + + // For backward compatibility. + IO.enumCase(Value, "false", FormatStyle::BLS_Block); + IO.enumCase(Value, "true", FormatStyle::BLS_AlignFirstComment); + } +}; + template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, FormatStyle::DAGArgStyle &Value) { IO.enumCase(Value, "DontBreak", FormatStyle::DAS_DontBreak); @@ -1628,7 +1640,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.CompactNamespaces = false; LLVMStyle.ConstructorInitializerIndentWidth = 4; LLVMStyle.ContinuationIndentWidth = 4; - LLVMStyle.Cpp11BracedListStyle = true; + LLVMStyle.Cpp11BracedListStyle = FormatStyle::BLS_AlignFirstComment; LLVMStyle.DerivePointerAlignment = false; LLVMStyle.DisableFormat = false; LLVMStyle.EmptyLineAfterAccessModifier = FormatStyle::ELAAMS_Never; @@ -1904,7 +1916,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) { // beneficial there. Investigate turning this on once proper string reflow // has been implemented. 
GoogleStyle.BreakStringLiterals = false; - GoogleStyle.Cpp11BracedListStyle = false; + GoogleStyle.Cpp11BracedListStyle = FormatStyle::BLS_Block; GoogleStyle.SpacesInContainerLiterals = false; } else if (Language == FormatStyle::LK_ObjC) { GoogleStyle.AlwaysBreakBeforeMultilineStrings = false; @@ -2000,7 +2012,7 @@ FormatStyle getMozillaStyle() { MozillaStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; MozillaStyle.ConstructorInitializerIndentWidth = 2; MozillaStyle.ContinuationIndentWidth = 2; - MozillaStyle.Cpp11BracedListStyle = false; + MozillaStyle.Cpp11BracedListStyle = FormatStyle::BLS_Block; MozillaStyle.FixNamespaceComments = false; MozillaStyle.IndentCaseLabels = true; MozillaStyle.ObjCSpaceAfterProperty = true; @@ -2023,7 +2035,7 @@ FormatStyle getWebKitStyle() { Style.BreakBeforeBraces = FormatStyle::BS_WebKit; Style.BreakConstructorInitializers = FormatStyle::BCIS_BeforeComma; Style.ColumnLimit = 0; - Style.Cpp11BracedListStyle = false; + Style.Cpp11BracedListStyle = FormatStyle::BLS_Block; Style.FixNamespaceComments = false; Style.IndentWidth = 4; Style.NamespaceIndentation = FormatStyle::NI_Inner; @@ -2043,7 +2055,7 @@ FormatStyle getGNUStyle() { Style.BreakBeforeBraces = FormatStyle::BS_GNU; Style.BreakBeforeTernaryOperators = true; Style.ColumnLimit = 79; - Style.Cpp11BracedListStyle = false; + Style.Cpp11BracedListStyle = FormatStyle::BLS_Block; Style.FixNamespaceComments = false; Style.KeepFormFeed = true; Style.SpaceBeforeParens = FormatStyle::SBPO_Always; @@ -2184,8 +2196,9 @@ std::error_code parseConfiguration(llvm::MemoryBufferRef Config, Input >> Styles; if (Input.error()) return Input.error(); + if (Styles.empty()) + return make_error_code(ParseError::Success); - assert(!Styles.empty()); const auto StyleCount = Styles.size(); // Start from the second style as (only) the first one may be the default. diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index c2956a179b8ed..d1c62642efd43 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -41,8 +41,7 @@ static constexpr std::array QtPropertyKeywords = { bool FormatToken::isQtProperty() const { assert(llvm::is_sorted(QtPropertyKeywords)); - return std::binary_search(QtPropertyKeywords.begin(), - QtPropertyKeywords.end(), TokenText); + return llvm::binary_search(QtPropertyKeywords, TokenText); } // Sorted common C++ non-keyword types. 
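The parseConfiguration change above turns an empty YAML document from an assertion failure into a clean success, so an empty .clang-format file now leaves the requested style untouched. A caller-side sketch, assuming only the public clang::format API:

    #include "clang/Format/Format.h"
    #include "llvm/Support/MemoryBufferRef.h"

    // Returns true on success; an empty Config is now a successful no-op.
    static bool applyConfig(llvm::MemoryBufferRef Config,
                            clang::format::FormatStyle &Style) {
      return !clang::format::parseConfiguration(Config, &Style);
    }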
@@ -66,12 +65,13 @@ bool FormatToken::isTypeOrIdentifier(const LangOptions &LangOpts) const { bool FormatToken::isBlockIndentedInitRBrace(const FormatStyle &Style) const { assert(is(tok::r_brace)); - if (!Style.Cpp11BracedListStyle || + assert(MatchingParen); + assert(MatchingParen->is(tok::l_brace)); + if (Style.Cpp11BracedListStyle == FormatStyle::BLS_Block || Style.AlignAfterOpenBracket != FormatStyle::BAS_BlockIndent) { return false; } const auto *LBrace = MatchingParen; - assert(LBrace && LBrace->is(tok::l_brace)); if (LBrace->is(BK_BracedInit)) return true; if (LBrace->Previous && LBrace->Previous->is(tok::equal)) @@ -88,7 +88,8 @@ bool FormatToken::opensBlockOrBlockTypeList(const FormatStyle &Style) const { return is(TT_ArrayInitializerLSquare) || is(TT_ProtoExtensionLSquare) || (is(tok::l_brace) && (getBlockKind() == BK_Block || is(TT_DictLiteral) || - (!Style.Cpp11BracedListStyle && NestingLevel == 0))) || + (Style.Cpp11BracedListStyle == FormatStyle::BLS_Block && + NestingLevel == 0))) || (is(tok::less) && Style.isProto()); } @@ -184,7 +185,8 @@ void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) { // In C++11 braced list style, we should not format in columns unless they // have many items (20 or more) or we allow bin-packing of function call // arguments. - if (Style.Cpp11BracedListStyle && !Style.BinPackArguments && + if (Style.Cpp11BracedListStyle != FormatStyle::BLS_Block && + !Style.BinPackArguments && (Commas.size() < 19 || !Style.BinPackLongBracedList)) { return; } @@ -228,7 +230,7 @@ void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) { ItemEnd = Token->MatchingParen; const FormatToken *NonCommentEnd = ItemEnd->getPreviousNonComment(); ItemLengths.push_back(CodePointsBetween(ItemBegin, NonCommentEnd)); - if (Style.Cpp11BracedListStyle && + if (Style.Cpp11BracedListStyle != FormatStyle::BLS_Block && !ItemEnd->Previous->isTrailingComment()) { // In Cpp11 braced list style, the } and possibly other subsequent // tokens will need to stay on a line with the last element. 
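Across these formatting hunks the boolean Cpp11BracedListStyle becomes the three-state BracedListStyle; existing configurations keep working because "true" and "false" are parsed as aliases for AlignFirstComment and Block. Through the C++ configuration API the migration looks like this (a sketch against the enum introduced by this patch):

    #include "clang/Format/Format.h"

    void migrateBracedListStyle(clang::format::FormatStyle &Style) {
      using FS = clang::format::FormatStyle;
      // Previously: Style.Cpp11BracedListStyle = true;
      Style.Cpp11BracedListStyle = FS::BLS_AlignFirstComment;
      // Previously: Style.Cpp11BracedListStyle = false;
      // Style.Cpp11BracedListStyle = FS::BLS_Block;
    }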
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index f015d27bed6af..6f3d24aefc1ca 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -30,9 +30,10 @@ namespace format { TYPE(ArraySubscriptLSquare) \ TYPE(AttributeColon) \ TYPE(AttributeLParen) \ + TYPE(AttributeLSquare) \ TYPE(AttributeMacro) \ TYPE(AttributeRParen) \ - TYPE(AttributeSquare) \ + TYPE(AttributeRSquare) \ TYPE(BinaryOperator) \ TYPE(BitFieldColon) \ TYPE(BlockComment) \ diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 5b784eded4601..c97a9e81eb59e 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -531,10 +531,6 @@ class AnnotatingParser { OpeningParen.Previous->is(TT_LeadingJavaAnnotation)) { CurrentToken->setType(TT_LeadingJavaAnnotation); } - if (OpeningParen.Previous && - OpeningParen.Previous->is(TT_AttributeSquare)) { - CurrentToken->setType(TT_AttributeSquare); - } if (!HasMultipleLines) OpeningParen.setPackingKind(PPK_Inconclusive); @@ -722,9 +718,11 @@ class AnnotatingParser { } else if (InsideInlineASM) { Left->setType(TT_InlineASMSymbolicNameLSquare); } else if (IsCpp11AttributeSpecifier) { - Left->setType(TT_AttributeSquare); - if (!IsInnerSquare && Left->Previous) - Left->Previous->EndsCppAttributeGroup = false; + if (!IsInnerSquare) { + Left->setType(TT_AttributeLSquare); + if (Left->Previous) + Left->Previous->EndsCppAttributeGroup = false; + } } else if (Style.isJavaScript() && Parent && Contexts.back().ContextKind == tok::l_brace && Parent->isOneOf(tok::l_brace, tok::comma)) { @@ -733,7 +731,7 @@ class AnnotatingParser { Parent && Parent->isOneOf(tok::l_brace, tok::comma)) { Left->setType(TT_DesignatedInitializerLSquare); } else if (IsCSharpAttributeSpecifier) { - Left->setType(TT_AttributeSquare); + Left->setType(TT_AttributeLSquare); } else if (CurrentToken->is(tok::r_square) && Parent && Parent->is(TT_TemplateCloser)) { Left->setType(TT_ArraySubscriptLSquare); @@ -797,13 +795,12 @@ class AnnotatingParser { while (CurrentToken) { if (CurrentToken->is(tok::r_square)) { - if (IsCpp11AttributeSpecifier) { - CurrentToken->setType(TT_AttributeSquare); - if (!IsInnerSquare) - CurrentToken->EndsCppAttributeGroup = true; + if (IsCpp11AttributeSpecifier && !IsInnerSquare) { + CurrentToken->setType(TT_AttributeRSquare); + CurrentToken->EndsCppAttributeGroup = true; } if (IsCSharpAttributeSpecifier) { - CurrentToken->setType(TT_AttributeSquare); + CurrentToken->setType(TT_AttributeRSquare); } else if (((CurrentToken->Next && CurrentToken->Next->is(tok::l_paren)) || (CurrentToken->Previous && @@ -1297,7 +1294,7 @@ class AnnotatingParser { bool consumeToken() { if (IsCpp) { const auto *Prev = CurrentToken->getPreviousNonComment(); - if (Prev && Prev->is(tok::r_square) && Prev->is(TT_AttributeSquare) && + if (Prev && Prev->is(TT_AttributeRSquare) && CurrentToken->isOneOf(tok::kw_if, tok::kw_switch, tok::kw_case, tok::kw_default, tok::kw_for, tok::kw_while) && mustBreakAfterAttributes(*CurrentToken, Style)) { @@ -2850,7 +2847,7 @@ class AnnotatingParser { T = Tok->Previous; continue; } - } else if (T->is(TT_AttributeSquare)) { + } else if (T->is(TT_AttributeRSquare)) { // Handle `x = (foo *[[clang::foo]])&v;`: if (T->MatchingParen && T->MatchingParen->Previous) { T = T->MatchingParen->Previous; @@ -3656,7 +3653,7 @@ static FormatToken *getFunctionName(const AnnotatedLine &Line, for (FormatToken *Tok = Line.getFirstNonComment(), *Name = nullptr; Tok; Tok = 
Tok->getNextNonComment()) { // Skip C++11 attributes both before and after the function name. - if (Tok->is(tok::l_square) && Tok->is(TT_AttributeSquare)) { + if (Tok->is(TT_AttributeLSquare)) { Tok = Tok->MatchingParen; if (!Tok) break; @@ -3794,18 +3791,12 @@ static bool isFunctionDeclarationName(const LangOptions &LangOpts, if (Current.is(TT_FunctionDeclarationName)) return true; - if (!Current.Tok.getIdentifierInfo()) + if (Current.isNoneOf(tok::identifier, tok::kw_operator)) return false; const auto *Prev = Current.getPreviousNonComment(); assert(Prev); - if (Prev->is(tok::coloncolon)) - Prev = Prev->Previous; - - if (!Prev) - return false; - const auto &Previous = *Prev; if (const auto *PrevPrev = Previous.getPreviousNonComment(); @@ -3854,6 +3845,8 @@ static bool isFunctionDeclarationName(const LangOptions &LangOpts, // Find parentheses of parameter list. if (Current.is(tok::kw_operator)) { + if (Line.startsWith(tok::kw_friend)) + return true; if (Previous.Tok.getIdentifierInfo() && Previous.isNoneOf(tok::kw_return, tok::kw_co_return)) { return true; @@ -4098,7 +4091,8 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { if (Current->is(TT_LineComment)) { if (Prev->is(BK_BracedInit) && Prev->opensScope()) { Current->SpacesRequiredBefore = - (Style.Cpp11BracedListStyle && !Style.SpacesInParensOptions.Other) + (Style.Cpp11BracedListStyle == FormatStyle::BLS_AlignFirstComment && + !Style.SpacesInParensOptions.Other) ? 0 : 1; } else if (Prev->is(TT_VerilogMultiLineListLParen)) { @@ -4328,7 +4322,7 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line, return 35; if (Right.isNoneOf(TT_ObjCMethodExpr, TT_LambdaLSquare, TT_ArrayInitializerLSquare, - TT_DesignatedInitializerLSquare, TT_AttributeSquare)) { + TT_DesignatedInitializerLSquare, TT_AttributeLSquare)) { return 500; } } @@ -4449,8 +4443,10 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line, (Left.ParameterCount <= 1 || Style.AllowAllArgumentsOnNextLine)) { return 0; } - if (Left.is(tok::l_brace) && !Style.Cpp11BracedListStyle) + if (Left.is(tok::l_brace) && + Style.Cpp11BracedListStyle == FormatStyle::BLS_Block) { return 19; + } return Left.ParameterCount > 1 ? Style.PenaltyBreakBeforeFirstCallParameter : 19; } @@ -4616,7 +4612,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, // Format empty list as `<>`. if (Left.is(tok::less) && Right.is(tok::greater)) return false; - return !Style.Cpp11BracedListStyle; + return Style.Cpp11BracedListStyle == FormatStyle::BLS_Block; } // Don't attempt to format operator<(), as it is handled later. 
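The isFunctionDeclarationName change above adds an early acceptance for operator declarations on friend lines, so a case like the following is classified without inspecting the preceding token:

    struct S {
      friend bool operator==(const S &, const S &); // now recognized as a
                                                    // function declaration
    };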
if (Right.isNot(TT_OverloadedOperatorLParen)) @@ -4784,7 +4780,8 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, const auto SpaceRequiredForArrayInitializerLSquare = [](const FormatToken &LSquareTok, const FormatStyle &Style) { return Style.SpacesInContainerLiterals || - (Style.isProto() && !Style.Cpp11BracedListStyle && + (Style.isProto() && + Style.Cpp11BracedListStyle == FormatStyle::BLS_Block && LSquareTok.endsSequence(tok::l_square, tok::colon, TT_SelectorName)); }; @@ -4808,7 +4805,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, if (Right.is(tok::l_square) && Right.isNoneOf(TT_ObjCMethodExpr, TT_LambdaLSquare, TT_DesignatedInitializerLSquare, - TT_StructuredBindingLSquare, TT_AttributeSquare) && + TT_StructuredBindingLSquare, TT_AttributeLSquare) && Left.isNoneOf(tok::numeric_constant, TT_DictLiteral) && !(Left.isNot(tok::r_square) && Style.SpaceBeforeSquareBrackets && Right.is(TT_ArraySubscriptLSquare))) { @@ -4817,7 +4814,8 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, if ((Left.is(tok::l_brace) && Left.isNot(BK_Block)) || (Right.is(tok::r_brace) && Right.MatchingParen && Right.MatchingParen->isNot(BK_Block))) { - return !Style.Cpp11BracedListStyle || Style.SpacesInParensOptions.Other; + return Style.Cpp11BracedListStyle == FormatStyle::BLS_Block || + Style.SpacesInParensOptions.Other; } if (Left.is(TT_BlockComment)) { // No whitespace in x(/*foo=*/1), except for JavaScript. @@ -4826,7 +4824,7 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, // Space between template and attribute. // e.g. template [[nodiscard]] ... - if (Left.is(TT_TemplateCloser) && Right.is(TT_AttributeSquare)) + if (Left.is(TT_TemplateCloser) && Right.is(TT_AttributeLSquare)) return true; // Space before parentheses common for all languages if (Right.is(tok::l_paren)) { @@ -4841,10 +4839,8 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, return Style.SpaceBeforeParensOptions.AfterRequiresInExpression || spaceRequiredBeforeParens(Right); } - if (Left.is(TT_AttributeRParen) || - (Left.is(tok::r_square) && Left.is(TT_AttributeSquare))) { + if (Left.isOneOf(TT_AttributeRParen, TT_AttributeRSquare)) return true; - } if (Left.is(TT_ForEachMacro)) { return Style.SpaceBeforeParensOptions.AfterForeachMacros || spaceRequiredBeforeParens(Right); @@ -4999,7 +4995,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, Left.Children.empty()) { if (Left.is(BK_Block)) return Style.SpaceInEmptyBraces != FormatStyle::SIEB_Never; - if (Style.Cpp11BracedListStyle) { + if (Style.Cpp11BracedListStyle != FormatStyle::BLS_Block) { return Style.SpacesInParens == FormatStyle::SIPO_Custom && Style.SpacesInParensOptions.InEmptyParentheses; } @@ -5081,7 +5077,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, if (Left.MatchingParen && Left.MatchingParen->is(TT_ProtoExtensionLSquare) && Right.isOneOf(tok::l_brace, tok::less)) { - return !Style.Cpp11BracedListStyle; + return Style.Cpp11BracedListStyle == FormatStyle::BLS_Block; } // A percent is probably part of a formatting specification, such as %lld. 
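The spaceRequiredBetween updates in this area preserve the old boolean behavior: by default, only the Block style keeps padding inside non-block braced lists. Concretely:

    #include <vector>
    // With Cpp11BracedListStyle == BLS_Block:
    std::vector<int> A{ 1, 2, 3 };
    // With any other BracedListStyle value:
    std::vector<int> B{1, 2, 3};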
if (Left.is(tok::percent)) @@ -5521,7 +5517,7 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, if (Left.is(tok::greater) && Right.is(tok::greater)) { if (Style.isTextProto() || (Style.Language == FormatStyle::LK_Proto && Left.is(TT_DictLiteral))) { - return !Style.Cpp11BracedListStyle; + return Style.Cpp11BracedListStyle == FormatStyle::BLS_Block; } return Right.is(TT_TemplateCloser) && Left.is(TT_TemplateCloser) && ((Style.Standard < FormatStyle::LS_Cpp11) || @@ -5662,16 +5658,14 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, } // Break after C# [...] and before public/protected/private/internal. - if (Left.is(TT_AttributeSquare) && Left.is(tok::r_square) && + if (Left.is(TT_AttributeRSquare) && (Right.isAccessSpecifier(/*ColonRequired=*/false) || Right.is(Keywords.kw_internal))) { return true; } // Break between ] and [ but only when there are really 2 attributes. - if (Left.is(TT_AttributeSquare) && Right.is(TT_AttributeSquare) && - Left.is(tok::r_square) && Right.is(tok::l_square)) { + if (Left.is(TT_AttributeRSquare) && Right.is(TT_AttributeLSquare)) return true; - } } else if (Style.isJavaScript()) { // FIXME: This might apply to other languages and token kinds. if (Right.is(tok::string_literal) && Left.is(tok::plus) && BeforeLeft && @@ -6382,7 +6376,7 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, return false; } if (Left.is(tok::equal) && Right.is(tok::l_brace) && - !Style.Cpp11BracedListStyle) { + Style.Cpp11BracedListStyle == FormatStyle::BLS_Block) { return false; } if (Left.is(TT_AttributeLParen) || @@ -6411,8 +6405,10 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, if (Right.isAttribute()) return true; - if (Right.is(tok::l_square) && Right.is(TT_AttributeSquare)) - return Left.isNot(TT_AttributeSquare); + if (Right.is(TT_AttributeLSquare)) { + assert(Left.isNot(tok::l_square)); + return true; + } if (Left.is(tok::identifier) && Right.is(tok::string_literal)) return true; @@ -6453,8 +6449,12 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, Left.getPrecedence() == prec::Assignment)) { return true; } - if ((Left.is(TT_AttributeSquare) && Right.is(tok::l_square)) || - (Left.is(tok::r_square) && Right.is(TT_AttributeSquare))) { + if (Left.is(TT_AttributeLSquare) && Right.is(tok::l_square)) { + assert(Right.isNot(TT_AttributeLSquare)); + return false; + } + if (Left.is(tok::r_square) && Right.is(TT_AttributeRSquare)) { + assert(Left.isNot(TT_AttributeRSquare)); return false; } diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index dec71191d7356..5e2584edac8f4 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -60,7 +60,7 @@ void printLine(llvm::raw_ostream &OS, const UnwrappedLine &Line, OS << "\n"; } -LLVM_ATTRIBUTE_UNUSED static void printDebugInfo(const UnwrappedLine &Line) { +[[maybe_unused]] static void printDebugInfo(const UnwrappedLine &Line) { printLine(llvm::dbgs(), Line); } diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index 54f366fc02502..65fc65e79fdc3 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -289,17 +289,20 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, SmallVector &Changes) { int Shift = 0; - // ScopeStack keeps track of the current scope depth. It contains indices of - // the first token on each scope. 
+ // ScopeStack keeps track of the current scope depth. It contains the levels + // of at most 2 scopes. The first one is the one that the matched token is + // in. The second one is the one that should not be moved by this procedure. // The "Matches" indices should only have tokens from the outer-most scope. // However, we do need to pay special attention to one class of tokens - // that are not in the outer-most scope, and that is function parameters - // which are split across multiple lines, as illustrated by this example: + // that are not in the outer-most scope, and that is the continuations of an + // unwrapped line whose positions are derived from a token to the right of the + // aligned token, as illustrated by this example: // double a(int x); // int b(int y, // double z); // In the above example, we need to take special care to ensure that - // 'double z' is indented along with it's owning function 'b'. + // 'double z' is indented along with its owning function 'b', because its + // position is derived from the '(' token to the right of the 'b' token. // The same holds for calling a function: // double a = foo(x); // int b = bar(foo(y), @@ -309,32 +312,28 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, // auto s = "Hello" // "World"; // Special handling is required for 'nested' ternary operators. - SmallVector ScopeStack; + SmallVector, 2> ScopeStack; for (unsigned i = Start; i != End; ++i) { auto &CurrentChange = Changes[i]; if (!Matches.empty() && Matches[0] < i) Matches.consume_front(); assert(Matches.empty() || Matches[0] >= i); - if (!ScopeStack.empty() && - CurrentChange.indentAndNestingLevel() < - Changes[ScopeStack.back()].indentAndNestingLevel()) { + while (!ScopeStack.empty() && + CurrentChange.indentAndNestingLevel() < ScopeStack.back()) { ScopeStack.pop_back(); } - // Compare current token to previous non-comment token to ensure whether - // it is in a deeper scope or not. - unsigned PreviousNonComment = i - 1; - while (PreviousNonComment > Start && - Changes[PreviousNonComment].Tok->is(tok::comment)) { - --PreviousNonComment; - } - if (i != Start && CurrentChange.indentAndNestingLevel() > - Changes[PreviousNonComment].indentAndNestingLevel()) { - ScopeStack.push_back(i); + // Keep track of the level that should not move with the aligned token. + if (ScopeStack.size() == 1u && CurrentChange.NewlinesBefore != 0u && + CurrentChange.indentAndNestingLevel() > ScopeStack[0] && + !CurrentChange.IsAligned) { + ScopeStack.push_back(CurrentChange.indentAndNestingLevel()); } - bool InsideNestedScope = !ScopeStack.empty(); + bool InsideNestedScope = + !ScopeStack.empty() && + CurrentChange.indentAndNestingLevel() > ScopeStack[0]; bool ContinuedStringLiteral = i > Start && CurrentChange.Tok->is(tok::string_literal) && Changes[i - 1].Tok->is(tok::string_literal); @@ -349,103 +348,20 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, if (!Matches.empty() && Matches[0] == i) { Shift = Column - (RightJustify ? CurrentChange.TokenLength : 0) - CurrentChange.StartOfTokenColumn; + ScopeStack = {CurrentChange.indentAndNestingLevel()}; CurrentChange.Spaces += Shift; } if (Shift == 0) continue; - // This is for function parameters that are split across multiple lines, - // as mentioned in the ScopeStack comment. 
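The rewritten ScopeStack comment above keeps the original motivating example: a continuation line must move together with the token it hangs off. With AlignConsecutiveDeclarations, for instance:

    // Wrapped parameters shift with their owning declaration when the names
    // 'a' and 'b' are aligned:
    double a(int x);
    int    b(int y,
             double z);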
- if (InsideNestedScope && CurrentChange.NewlinesBefore > 0) { - unsigned ScopeStart = ScopeStack.back(); - auto ShouldShiftBeAdded = [&] { - // Function declaration - if (Changes[ScopeStart - 1].Tok->is(TT_FunctionDeclarationName)) - return true; - - // Lambda. - if (Changes[ScopeStart - 1].Tok->is(TT_LambdaLBrace)) - return false; - - // Continued function declaration - if (ScopeStart > Start + 1 && - Changes[ScopeStart - 2].Tok->is(TT_FunctionDeclarationName)) { - return true; - } - - // Continued (template) function call. - if (ScopeStart > Start + 1 && - Changes[ScopeStart - 2].Tok->isOneOf(tok::identifier, - TT_TemplateCloser) && - Changes[ScopeStart - 1].Tok->is(tok::l_paren) && - Changes[ScopeStart].Tok->isNot(TT_LambdaLSquare)) { - if (CurrentChange.Tok->MatchingParen && - CurrentChange.Tok->MatchingParen->is(TT_LambdaLBrace)) { - return false; - } - if (Changes[ScopeStart].NewlinesBefore > 0) - return false; - if (CurrentChange.Tok->is(tok::l_brace) && - CurrentChange.Tok->is(BK_BracedInit)) { - return true; - } - return Style.BinPackArguments; - } - - // Ternary operator - if (CurrentChange.Tok->is(TT_ConditionalExpr)) - return true; - - // Period Initializer .XXX = 1. - if (CurrentChange.Tok->is(TT_DesignatedInitializerPeriod)) - return true; - - // Continued ternary operator - if (CurrentChange.Tok->Previous && - CurrentChange.Tok->Previous->is(TT_ConditionalExpr)) { - return true; - } - - // Continued direct-list-initialization using braced list. - if (ScopeStart > Start + 1 && - Changes[ScopeStart - 2].Tok->is(tok::identifier) && - Changes[ScopeStart - 1].Tok->is(tok::l_brace) && - CurrentChange.Tok->is(tok::l_brace) && - CurrentChange.Tok->is(BK_BracedInit)) { - return true; - } - - // Continued braced list. - if (ScopeStart > Start + 1 && - Changes[ScopeStart - 2].Tok->isNot(tok::identifier) && - Changes[ScopeStart - 1].Tok->is(tok::l_brace) && - CurrentChange.Tok->isNot(tok::r_brace)) { - for (unsigned OuterScopeStart : llvm::reverse(ScopeStack)) { - // Lambda. - if (OuterScopeStart > Start && - Changes[OuterScopeStart - 1].Tok->is(TT_LambdaLBrace)) { - return false; - } - } - if (Changes[ScopeStart].NewlinesBefore > 0) - return false; - return true; - } - - // Continued template parameter. - if (Changes[ScopeStart - 1].Tok->is(TT_TemplateOpener)) - return true; - - return false; - }; - - if (ShouldShiftBeAdded()) - CurrentChange.Spaces += Shift; - } - - if (ContinuedStringLiteral) + // This is for lines that are split across multiple lines, as mentioned in + // the ScopeStack comment. The stack size being 1 means that the token is + // not in a scope that should not move. + if (ScopeStack.size() == 1u && CurrentChange.NewlinesBefore > 0 && + (ContinuedStringLiteral || InsideNestedScope)) { CurrentChange.Spaces += Shift; + } // We should not remove required spaces unless we break the line before. assert(Shift > 0 || Changes[i].NewlinesBefore > 0 || @@ -516,7 +432,11 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, // right-justified. It is used to align compound assignments like `+=` and `=`. // When RightJustify and ACS.PadOperators are true, operators in each block to // be aligned will be padded on the left to the same length before aligning. -template +// +// The simple check will not look at the indentaion and nesting level to recurse +// into the line for alignment. It will also not count the commas. This is e.g. +// for aligning macro definitions. 
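The SimpleCheck mode described above lets AlignTokens skip the scope recursion and comma counting that make no sense for preprocessor lines; the reworked alignConsecutiveMacros later in this patch is its first user. The intended output shape for AlignConsecutiveMacros:

    #define SHORT      1
    #define LONGER     2
    #define LONGEST(x) ((x) + 1) // function-like: no space before the '('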
+template static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, SmallVector &Changes, unsigned StartAt, @@ -549,9 +469,9 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, // Measure the scope level (i.e. depth of (), [], {}) of the first token, and // abort when we hit any token in a higher scope than the starting one. - auto IndentAndNestingLevel = StartAt < Changes.size() - ? Changes[StartAt].indentAndNestingLevel() - : std::tuple(); + const auto IndentAndNestingLevel = + StartAt < Changes.size() ? Changes[StartAt].indentAndNestingLevel() + : std::tuple(); // Keep track of the number of commas before the matching tokens, we will only // align a sequence of matching tokens if they are preceded by the same number @@ -586,15 +506,15 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, MatchedIndices.clear(); }; - unsigned i = StartAt; - for (unsigned e = Changes.size(); i != e; ++i) { - auto &CurrentChange = Changes[i]; + unsigned I = StartAt; + for (unsigned E = Changes.size(); I != E; ++I) { + auto &CurrentChange = Changes[I]; if (CurrentChange.indentAndNestingLevel() < IndentAndNestingLevel) break; if (CurrentChange.NewlinesBefore != 0) { CommasBeforeMatch = 0; - EndOfSequence = i; + EndOfSequence = I; // Whether to break the alignment sequence because of an empty line. bool EmptyLineBreak = @@ -610,8 +530,8 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, // A new line starts, re-initialize line status tracking bools. // Keep the match state if a string literal is continued on this line. - if (i == 0 || CurrentChange.Tok->isNot(tok::string_literal) || - Changes[i - 1].Tok->isNot(tok::string_literal)) { + if (I == 0 || CurrentChange.Tok->isNot(tok::string_literal) || + Changes[I - 1].Tok->isNot(tok::string_literal)) { FoundMatchOnLine = false; } LineIsComment = true; @@ -620,14 +540,17 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, if (CurrentChange.Tok->isNot(tok::comment)) LineIsComment = false; - if (CurrentChange.Tok->is(tok::comma)) { - ++CommasBeforeMatch; - } else if (CurrentChange.indentAndNestingLevel() > IndentAndNestingLevel) { - // Call AlignTokens recursively, skipping over this scope block. - unsigned StoppedAt = - AlignTokens(Style, Matches, Changes, i, ACS, RightJustify); - i = StoppedAt - 1; - continue; + if (!SimpleCheck) { + if (CurrentChange.Tok->is(tok::comma)) { + ++CommasBeforeMatch; + } else if (CurrentChange.indentAndNestingLevel() > + IndentAndNestingLevel) { + // Call AlignTokens recursively, skipping over this scope block. + const auto StoppedAt = + AlignTokens(Style, Matches, Changes, I, ACS, RightJustify); + I = StoppedAt - 1; + continue; + } } if (!Matches(CurrentChange)) @@ -636,7 +559,7 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, // If there is more than one matching token per line, or if the number of // preceding commas, do not match anymore, end the sequence. 
if (FoundMatchOnLine || CommasBeforeMatch != CommasBeforeLastMatch) { - MatchedIndices.push_back(i); + MatchedIndices.push_back(I); AlignCurrentSequence(); } @@ -644,29 +567,69 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, FoundMatchOnLine = true; if (StartOfSequence == 0) - StartOfSequence = i; + StartOfSequence = I; unsigned ChangeWidthLeft = CurrentChange.StartOfTokenColumn; unsigned ChangeWidthAnchor = 0; unsigned ChangeWidthRight = 0; + unsigned CurrentChangeWidthRight = 0; if (RightJustify) if (ACS.PadOperators) ChangeWidthAnchor = CurrentChange.TokenLength; else ChangeWidthLeft += CurrentChange.TokenLength; else - ChangeWidthRight = CurrentChange.TokenLength; - for (unsigned j = i + 1; j != e && Changes[j].NewlinesBefore == 0; ++j) { - ChangeWidthRight += Changes[j].Spaces; + CurrentChangeWidthRight = CurrentChange.TokenLength; + const FormatToken *MatchingParenToEncounter = nullptr; + for (unsigned J = I + 1; + J != E && (Changes[J].NewlinesBefore == 0 || MatchingParenToEncounter); + ++J) { + const auto &Change = Changes[J]; + const auto *Tok = Change.Tok; + + if (Tok->MatchingParen) { + if (Tok->isOneOf(tok::l_paren, tok::l_brace, tok::l_square, + TT_TemplateOpener) && + !MatchingParenToEncounter) { + // If the next token is on the next line, we probably don't need to + // check the following lengths, because it most likely isn't aligned + // with the rest. + if (J + 1 != E && Changes[J + 1].NewlinesBefore == 0) + MatchingParenToEncounter = Tok->MatchingParen; + } else if (MatchingParenToEncounter == Tok->MatchingParen) { + MatchingParenToEncounter = nullptr; + } + } + + if (Change.NewlinesBefore != 0) { + ChangeWidthRight = std::max(ChangeWidthRight, CurrentChangeWidthRight); + const auto ChangeWidthStart = ChangeWidthLeft + ChangeWidthAnchor; + // If the position of the current token is columnwise before the begin + // of the alignment, we drop out here, because the next line does not + // have to be moved with the previous one(s) for the alignment. E.g.: + // int i1 = 1; | <- ColumnLimit | int i1 = 1; + // int j = 0; | Without the break -> | int j = 0; + // int k = bar( | We still want to align the = | int k = bar( + // argument1, | here, even if we can't move | argument1, + // argument2); | the following lines. | argument2); + if (static_cast(Change.Spaces) < ChangeWidthStart) + break; + CurrentChangeWidthRight = Change.Spaces - ChangeWidthStart; + } else { + CurrentChangeWidthRight += Change.Spaces; + } + // Changes are generally 1:1 with the tokens, but a change could also be // inside of a token, in which case it's counted more than once: once for // the whitespace surrounding the token (!IsInsideToken) and once for // each whitespace change within it (IsInsideToken). // Therefore, changes inside of a token should only count the space. - if (!Changes[j].IsInsideToken) - ChangeWidthRight += Changes[j].TokenLength; + if (!Change.IsInsideToken) + CurrentChangeWidthRight += Change.TokenLength; } + ChangeWidthRight = std::max(ChangeWidthRight, CurrentChangeWidthRight); + // If we are restricted by the maximum column width, end the sequence. 
unsigned NewLeft = std::max(ChangeWidthLeft, WidthLeft); unsigned NewAnchor = std::max(ChangeWidthAnchor, WidthAnchor); @@ -675,7 +638,7 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, if (Style.ColumnLimit != 0 && Style.ColumnLimit < NewLeft + NewAnchor + NewRight) { AlignCurrentSequence(); - StartOfSequence = i; + StartOfSequence = I; WidthLeft = ChangeWidthLeft; WidthAnchor = ChangeWidthAnchor; WidthRight = ChangeWidthRight; @@ -684,12 +647,12 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, WidthAnchor = NewAnchor; WidthRight = NewRight; } - MatchedIndices.push_back(i); + MatchedIndices.push_back(I); } - EndOfSequence = i; + EndOfSequence = I; AlignCurrentSequence(); - return i; + return I; } // Aligns a sequence of matching tokens, on the MinColumn column. @@ -740,7 +703,7 @@ void WhitespaceManager::alignConsecutiveMacros() { auto AlignMacrosMatches = [](const Change &C) { const FormatToken *Current = C.Tok; - unsigned SpacesRequiredBefore = 1; + assert(Current); if (Current->SpacesRequiredBefore == 0 || !Current->Previous) return false; @@ -749,79 +712,26 @@ void WhitespaceManager::alignConsecutiveMacros() { // If token is a ")", skip over the parameter list, to the // token that precedes the "(" - if (Current->is(tok::r_paren) && Current->MatchingParen) { - Current = Current->MatchingParen->Previous; - SpacesRequiredBefore = 0; - } - - if (!Current || Current->isNot(tok::identifier)) - return false; - - if (!Current->Previous || Current->Previous->isNot(tok::pp_define)) - return false; - - // For a macro function, 0 spaces are required between the - // identifier and the lparen that opens the parameter list. - // For a simple macro, 1 space is required between the - // identifier and the first token of the defined value. - return Current->Next->SpacesRequiredBefore == SpacesRequiredBefore; - }; - - unsigned MinColumn = 0; - - // Start and end of the token sequence we're processing. - unsigned StartOfSequence = 0; - unsigned EndOfSequence = 0; - - // Whether a matching token has been found on the current line. - bool FoundMatchOnLine = false; - - // Whether the current line consists only of comments - bool LineIsComment = true; - - unsigned I = 0; - for (unsigned E = Changes.size(); I != E; ++I) { - if (Changes[I].NewlinesBefore != 0) { - EndOfSequence = I; - - // Whether to break the alignment sequence because of an empty line. - bool EmptyLineBreak = (Changes[I].NewlinesBefore > 1) && - !Style.AlignConsecutiveMacros.AcrossEmptyLines; - - // Whether to break the alignment sequence because of a line without a - // match. - bool NoMatchBreak = - !FoundMatchOnLine && - !(LineIsComment && Style.AlignConsecutiveMacros.AcrossComments); - - if (EmptyLineBreak || NoMatchBreak) { - AlignMatchingTokenSequence(StartOfSequence, EndOfSequence, MinColumn, - AlignMacrosMatches, Changes); + if (Current->is(tok::r_paren)) { + const auto *MatchingParen = Current->MatchingParen; + // For a macro function, 0 spaces are required between the + // identifier and the lparen that opens the parameter list. + if (!MatchingParen || MatchingParen->SpacesRequiredBefore > 0 || + !MatchingParen->Previous) { + return false; } - - // A new line starts, re-initialize line status tracking bools. - FoundMatchOnLine = false; - LineIsComment = true; + Current = MatchingParen->Previous; + } else if (Current->Next->SpacesRequiredBefore != 1) { + // For a simple macro, 1 space is required between the + // identifier and the first token of the defined value. 
+      return false;
+    }
 
-    if (Changes[I].Tok->isNot(tok::comment))
-      LineIsComment = false;
-
-    if (!AlignMacrosMatches(Changes[I]))
-      continue;
-
-    FoundMatchOnLine = true;
-
-    if (StartOfSequence == 0)
-      StartOfSequence = I;
-
-    unsigned ChangeMinColumn = Changes[I].StartOfTokenColumn;
-    MinColumn = std::max(MinColumn, ChangeMinColumn);
-  }
+    return Current->endsSequence(tok::identifier, tok::pp_define);
+  };
 
-  EndOfSequence = I;
-  AlignMatchingTokenSequence(StartOfSequence, EndOfSequence, MinColumn,
-                             AlignMacrosMatches, Changes);
+  AlignTokens</*SimpleCheck=*/true>(
+      Style, AlignMacrosMatches, Changes, 0, Style.AlignConsecutiveMacros);
 }
 
 void WhitespaceManager::alignConsecutiveAssignments() {
@@ -1322,7 +1232,8 @@ void WhitespaceManager::alignArrayInitializersRightJustified(
   if (!CellDescs.isRectangular())
     return;
 
-  const int BracePadding = Style.Cpp11BracedListStyle ? 0 : 1;
+  const int BracePadding =
+      Style.Cpp11BracedListStyle != FormatStyle::BLS_Block ? 0 : 1;
   auto &Cells = CellDescs.Cells;
   // Now go through and fixup the spaces.
   auto *CellIter = Cells.begin();
@@ -1398,7 +1309,8 @@ void WhitespaceManager::alignArrayInitializersLeftJustified(
   if (!CellDescs.isRectangular())
     return;
 
-  const int BracePadding = Style.Cpp11BracedListStyle ? 0 : 1;
+  const int BracePadding =
+      Style.Cpp11BracedListStyle != FormatStyle::BLS_Block ? 0 : 1;
   auto &Cells = CellDescs.Cells;
   // Now go through and fixup the spaces.
   auto *CellIter = Cells.begin();
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 7eb4ef3c75fc2..2fd283aaf8b64 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -4083,13 +4083,13 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
       auto Diag = Diags.Report(diag::note_drv_use_standard);
       Diag << Std.getName() << Std.getDescription();
       unsigned NumAliases = 0;
-#define LANGSTANDARD(id, name, lang, desc, features)
+#define LANGSTANDARD(id, name, lang, desc, features, version)
 #define LANGSTANDARD_ALIAS(id, alias) \
   if (KindValue == LangStandard::lang_##id) ++NumAliases;
 #define LANGSTANDARD_ALIAS_DEPR(id, alias)
 #include "clang/Basic/LangStandards.def"
       Diag << NumAliases;
-#define LANGSTANDARD(id, name, lang, desc, features)
+#define LANGSTANDARD(id, name, lang, desc, features, version)
 #define LANGSTANDARD_ALIAS(id, alias) \
   if (KindValue == LangStandard::lang_##id) Diag << alias;
 #define LANGSTANDARD_ALIAS_DEPR(id, alias)
diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp
index 1b63c40a6efd7..0daa20a87dd7d 100644
--- a/clang/lib/Frontend/FrontendAction.cpp
+++ b/clang/lib/Frontend/FrontendAction.cpp
@@ -629,7 +629,7 @@ static std::error_code collectModuleHeaderIncludes(
     // Check whether this entry has an extension typically associated with
     // headers.
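// [Editorial sketch, not part of the patch] Lines the rewritten
// AlignMacrosMatches accepts, per the spacing rules encoded above:
// object-like macros need exactly one space between the identifier and the
// body, function-like macros none between the identifier and '(':
#define KILO       1000        // matches: identifier, one space, value
#define MEGA       1000000     // matches: aligned with the line above
#define SQUARE(x)  ((x) * (x)) // matches: no space before the parameter list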
     if (!llvm::StringSwitch<bool>(llvm::sys::path::extension(Dir->path()))
-             .Cases(".h", ".H", ".hh", ".hpp", true)
+             .Cases({".h", ".H", ".hh", ".hpp"}, true)
              .Default(false))
       continue;
diff --git a/clang/lib/Frontend/FrontendOptions.cpp b/clang/lib/Frontend/FrontendOptions.cpp
index 32ed99571e85d..fb178b6b942e2 100644
--- a/clang/lib/Frontend/FrontendOptions.cpp
+++ b/clang/lib/Frontend/FrontendOptions.cpp
@@ -14,25 +14,26 @@ using namespace clang;
 InputKind FrontendOptions::getInputKindForExtension(StringRef Extension) {
   return llvm::StringSwitch<InputKind>(Extension)
-      .Cases("ast", "pcm", InputKind(Language::Unknown, InputKind::Precompiled))
+      .Cases({"ast", "pcm"},
+             InputKind(Language::Unknown, InputKind::Precompiled))
       .Case("c", Language::C)
-      .Cases("S", "s", Language::Asm)
+      .Cases({"S", "s"}, Language::Asm)
       .Case("i", InputKind(Language::C).getPreprocessed())
       .Case("ii", InputKind(Language::CXX).getPreprocessed())
       .Case("cui", InputKind(Language::CUDA).getPreprocessed())
       .Case("m", Language::ObjC)
       .Case("mi", InputKind(Language::ObjC).getPreprocessed())
-      .Cases("mm", "M", Language::ObjCXX)
+      .Cases({"mm", "M"}, Language::ObjCXX)
       .Case("mii", InputKind(Language::ObjCXX).getPreprocessed())
-      .Cases("C", "cc", "cp", Language::CXX)
-      .Cases("cpp", "CPP", "c++", "cxx", "hpp", "hxx", Language::CXX)
+      .Cases({"C", "cc", "cp"}, Language::CXX)
+      .Cases({"cpp", "CPP", "c++", "cxx", "hpp", "hxx"}, Language::CXX)
       .Case("cppm", Language::CXX)
-      .Cases("iim", "iih", InputKind(Language::CXX).getPreprocessed())
+      .Cases({"iim", "iih"}, InputKind(Language::CXX).getPreprocessed())
       .Case("cl", Language::OpenCL)
       .Case("clcpp", Language::OpenCLCXX)
-      .Cases("cu", "cuh", Language::CUDA)
+      .Cases({"cu", "cuh"}, Language::CUDA)
       .Case("hip", Language::HIP)
-      .Cases("ll", "bc", Language::LLVM_IR)
+      .Cases({"ll", "bc"}, Language::LLVM_IR)
       .Case("hlsl", Language::HLSL)
       .Case("cir", Language::CIR)
       .Default(Language::Unknown);
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index b899fb9c6494a..47f1d5a6b636c 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -459,43 +459,12 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
   //   value is, are implementation-defined.
   // (Removed in C++20.)
   if (!LangOpts.CPlusPlus) {
-    if (LangOpts.C2y)
-      Builder.defineMacro("__STDC_VERSION__", "202400L");
-    else if (LangOpts.C23)
-      Builder.defineMacro("__STDC_VERSION__", "202311L");
-    else if (LangOpts.C17)
-      Builder.defineMacro("__STDC_VERSION__", "201710L");
-    else if (LangOpts.C11)
-      Builder.defineMacro("__STDC_VERSION__", "201112L");
-    else if (LangOpts.C99)
-      Builder.defineMacro("__STDC_VERSION__", "199901L");
-    else if (!LangOpts.GNUMode && LangOpts.Digraphs)
-      Builder.defineMacro("__STDC_VERSION__", "199409L");
+    if (std::optional<unsigned> Lang = LangOpts.getCLangStd())
+      Builder.defineMacro("__STDC_VERSION__", Twine(*Lang) + "L");
   } else {
     //   -- __cplusplus
-    if (LangOpts.CPlusPlus26)
-      // FIXME: Use correct value for C++26.
-      Builder.defineMacro("__cplusplus", "202400L");
-    else if (LangOpts.CPlusPlus23)
-      Builder.defineMacro("__cplusplus", "202302L");
-    //   [C++20] The integer literal 202002L.
-    else if (LangOpts.CPlusPlus20)
-      Builder.defineMacro("__cplusplus", "202002L");
-    //   [C++17] The integer literal 201703L.
-    else if (LangOpts.CPlusPlus17)
-      Builder.defineMacro("__cplusplus", "201703L");
-    //   [C++14] The name __cplusplus is defined to the value 201402L when
-    //   compiling a C++ translation unit.
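// [Editorial sketch, not part of the patch] The .Cases hunks above move
// llvm::StringSwitch callers onto the initializer-list overload of Cases;
// a minimal caller, assuming llvm/ADT/StringSwitch.h with that overload
// available, mirroring the FrontendAction change:
#include "llvm/ADT/StringSwitch.h"
static bool isHeaderExtension(llvm::StringRef Ext) {
  return llvm::StringSwitch<bool>(Ext)
      .Cases({".h", ".H", ".hh", ".hpp"}, true)
      .Default(false);
}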
-    else if (LangOpts.CPlusPlus14)
-      Builder.defineMacro("__cplusplus", "201402L");
-    //   [C++11] The name __cplusplus is defined to the value 201103L when
-    //   compiling a C++ translation unit.
-    else if (LangOpts.CPlusPlus11)
-      Builder.defineMacro("__cplusplus", "201103L");
-    //   [C++03] The name __cplusplus is defined to the value 199711L when
-    //   compiling a C++ translation unit.
-    else
-      Builder.defineMacro("__cplusplus", "199711L");
+    Builder.defineMacro("__cplusplus",
+                        Twine(*LangOpts.getCPlusPlusLangStd()) + "L");
 
     //   -- __STDCPP_DEFAULT_NEW_ALIGNMENT__
     //   [C++17] An integer literal of type std::size_t whose value is the
@@ -616,6 +585,7 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
     Builder.defineMacro("__HIP_MEMORY_SCOPE_WORKGROUP", "3");
     Builder.defineMacro("__HIP_MEMORY_SCOPE_AGENT", "4");
     Builder.defineMacro("__HIP_MEMORY_SCOPE_SYSTEM", "5");
+    Builder.defineMacro("__HIP_MEMORY_SCOPE_CLUSTER", "6");
     if (LangOpts.HIPStdPar) {
       Builder.defineMacro("__HIPSTDPAR__");
       if (LangOpts.HIPStdParInterposeAlloc) {
@@ -904,6 +874,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   Builder.defineMacro("__MEMORY_SCOPE_WRKGRP", "2");
   Builder.defineMacro("__MEMORY_SCOPE_WVFRNT", "3");
   Builder.defineMacro("__MEMORY_SCOPE_SINGLE", "4");
+  Builder.defineMacro("__MEMORY_SCOPE_CLUSTR", "5");
 
   // Define macros for the OpenCL memory scope.
   // The values should match AtomicScopeOpenCLModel::ID enum.
diff --git a/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp b/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp
index 42f2d6591b213..dee29fe004f42 100644
--- a/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp
+++ b/clang/lib/Frontend/Rewrite/RewriteModernObjC.cpp
@@ -852,7 +852,7 @@ RewriteModernObjC::getIvarAccessString(ObjCIvarDecl *D) {
     IvarT = GetGroupRecordTypeForObjCIvarBitfield(D);
 
   if (!IvarT->getAs<TypedefType>() && IvarT->isRecordType()) {
-    RecordDecl *RD = IvarT->castAsCanonical<RecordType>()->getOriginalDecl();
+    RecordDecl *RD = IvarT->castAsCanonical<RecordType>()->getDecl();
     RD = RD->getDefinition();
     if (RD && !RD->getDeclName().getAsIdentifierInfo()) {
       // decltype(((Foo_IMPL*)0)->bar) *
@@ -7453,8 +7453,7 @@ Stmt *RewriteModernObjC::RewriteObjCIvarRefExpr(ObjCIvarRefExpr *IV) {
       IvarT = GetGroupRecordTypeForObjCIvarBitfield(D);
 
     if (!IvarT->getAs<TypedefType>() && IvarT->isRecordType()) {
-      RecordDecl *RD =
-          IvarT->castAsCanonical<RecordType>()->getOriginalDecl();
+      RecordDecl *RD = IvarT->castAsCanonical<RecordType>()->getDecl();
       RD = RD->getDefinition();
       if (RD && !RD->getDeclName().getAsIdentifierInfo()) {
         // decltype(((Foo_IMPL*)0)->bar) *
diff --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
index da1e39ac7270e..19ce7a5d2c86b 100644
--- a/clang/lib/Headers/__clang_hip_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -25,6 +25,9 @@
 #define __constant__ __attribute__((constant))
 #define __managed__ __attribute__((managed))
 
+#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__)))
+#define __no_cluster__ __attribute__((no_cluster))
+
 #if !defined(__cplusplus) || __cplusplus < 201103L
 #define nullptr NULL;
 #endif
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 4aaca2db8236a..fdb825fbbd134 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -834,10 +834,9 @@ _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the sums.
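// [Editorial sketch, not part of the patch] The __STDC_VERSION__ hunk above
// funnels the removed if/else chain through LangOpts.getCLangStd(), whose
// definition is not part of this diff. An assumed shape, reproducing the
// deleted values on a hypothetical stand-in type (not the real
// clang::LangOptions):
#include <optional>
struct LangOptsSketch {
  bool C2y = false, C23 = false, C17 = false, C11 = false, C99 = true;
  bool GNUMode = false, Digraphs = true;
  std::optional<unsigned> getCLangStd() const {
    if (C2y) return 202400; // placeholder value, C2y is not final
    if (C23) return 202311;
    if (C17) return 201710;
    if (C11) return 201112;
    if (C99) return 199901;
    if (!GNUMode && Digraphs) return 199409; // C94 (Amendment 1)
    return std::nullopt; // C89/K&R: __STDC_VERSION__ stays undefined
  }
};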
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hadd_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_hadd_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); } /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit @@ -866,10 +865,9 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [8 x i32] containing the sums. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hadd_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_hadd_epi32(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); } /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit @@ -901,10 +899,9 @@ _mm256_hadd_epi32(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the sums. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hadds_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_hadds_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); } /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit @@ -937,10 +934,9 @@ _mm256_hadds_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the differences. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hsub_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_hsub_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); } /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit @@ -969,10 +965,9 @@ _mm256_hsub_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [8 x i32] containing the differences. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hsub_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_hsub_epi32(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); } /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit @@ -1005,10 +1000,9 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the differences. 
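// [Editorial aside, not part of the patch] Usage of the horizontal ops
// documented above: within each 128-bit half, adjacent element pairs are
// combined, first from __a and then from __b (compile with AVX2 enabled,
// e.g. -mavx2):
#include <immintrin.h>
__m256i pairwise_sums_16(__m256i a, __m256i b) {
  return _mm256_hadd_epi16(a, b); // per lane: a0+a1 ... a6+a7, b0+b1 ... b6+b7
}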
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hsubs_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_hsubs_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); } /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a @@ -1304,9 +1298,8 @@ _mm256_min_epu32(__m256i __a, __m256i __b) { /// \param __a /// A 256-bit integer vector containing the source bytes. /// \returns The 32-bit integer mask. -static __inline__ int __DEFAULT_FN_ATTRS256 -_mm256_movemask_epi8(__m256i __a) -{ +static __inline__ int __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_movemask_epi8(__m256i __a) { return __builtin_ia32_pmovmskb256((__v32qi)__a); } @@ -1656,9 +1649,8 @@ _mm256_mul_epi32(__m256i __a, __m256i __b) { /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the rounded products. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mulhrs_epi16(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mulhrs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b); } @@ -1676,8 +1668,7 @@ _mm256_mulhrs_epi16(__m256i __a, __m256i __b) /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the products. static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_mulhi_epu16(__m256i __a, __m256i __b) -{ +_mm256_mulhi_epu16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b); } @@ -1858,9 +1849,8 @@ _mm256_sad_epu8(__m256i __a, __m256i __b) /// control byte specify the index (within the same 128-bit half) of \a __a /// to copy to the result byte. /// \returns A 256-bit integer vector containing the result. 
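// [Editorial aside, not part of the patch] The *_CONSTEXPR attribute macros
// that these intrinsic-header hunks switch to all follow one pattern, spelled
// out verbatim in lines removed further down this diff: constexpr is appended
// only for C++11 and later, so C callers keep seeing the plain always_inline
// attributes.
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
#else
#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#endif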
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shuffle_epi8(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_shuffle_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b); } diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 473fe94af65d8..ac75b6ccde735 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -866,23 +866,20 @@ _mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_shuffle_epi8(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_shuffle_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_shuffle_epi8(__A, __B), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_shuffle_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); @@ -1006,23 +1003,20 @@ _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mulhrs_epi16(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mulhrs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmulhrsw512((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_mulhrs_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_mulhrs_epi16(__A, __B), (__v32hi)_mm512_setzero_si512()); diff --git a/clang/lib/Headers/avx512cdintrin.h b/clang/lib/Headers/avx512cdintrin.h index 88992983cdd89..fb6dcb6dd8ad1 100644 --- a/clang/lib/Headers/avx512cdintrin.h +++ b/clang/lib/Headers/avx512cdintrin.h @@ -15,94 +15,82 @@ #define __AVX512CDINTRIN_H /* Define the default attributes for the functions in this file. 
*/ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + constexpr __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512cd"), __min_vector_width__(512))) +#else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), \ __min_vector_width__(512))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr -#else -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS #endif static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_conflict_epi64 (__m512i __A) -{ - return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A); +_mm512_conflict_epi64(__m512i __A) { + return (__m512i)__builtin_ia32_vpconflictdi_512((__v8di)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_conflict_epi64(__A), - (__v8di)__W); +_mm512_mask_conflict_epi64(__m512i __W, __mmask8 __U, __m512i __A) { + return (__m512i)__builtin_ia32_selectq_512( + (__mmask8)__U, (__v8di)_mm512_conflict_epi64(__A), (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A) -{ +_mm512_maskz_conflict_epi64(__mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_conflict_epi64(__A), - (__v8di)_mm512_setzero_si512 ()); + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_conflict_epi32 (__m512i __A) -{ - return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A); +_mm512_conflict_epi32(__m512i __A) { + return (__m512i)__builtin_ia32_vpconflictsi_512((__v16si)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_conflict_epi32(__A), - (__v16si)__W); +_mm512_mask_conflict_epi32(__m512i __W, __mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_conflict_epi32(__A), (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_conflict_epi32(__A), - (__v16si)_mm512_setzero_si512()); +_mm512_maskz_conflict_epi32(__mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_conflict_epi32(__A), + (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_lzcnt_epi32(__m512i __A) { +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_lzcnt_epi32(__m512i __A) { return (__m512i)__builtin_elementwise_clzg((__v16si)__A, (__v16si)_mm512_set1_epi32(32)); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_lzcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_lzcnt_epi32(__A), - (__v16si)__W); + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_lzcnt_epi32(__A), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_lzcnt_epi32(__mmask16 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, 
(__v16si)_mm512_lzcnt_epi32(__A), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_lzcnt_epi64(__m512i __A) { +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_lzcnt_epi64(__m512i __A) { return (__m512i)__builtin_elementwise_clzg( (__v8di)__A, (__v8di)_mm512_set1_epi64((long long)64)); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_lzcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_lzcnt_epi64(__A), - (__v8di)__W); + return (__m512i)__builtin_ia32_selectq_512( + (__mmask8)__U, (__v8di)_mm512_lzcnt_epi64(__A), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_lzcnt_epi64(__mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)_mm512_lzcnt_epi64(__A), @@ -110,19 +98,15 @@ _mm512_maskz_lzcnt_epi64(__mmask8 __U, __m512i __A) { } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_broadcastmb_epi64 (__mmask8 __A) -{ - return (__m512i) _mm512_set1_epi64((long long) __A); +_mm512_broadcastmb_epi64(__mmask8 __A) { + return (__m512i)_mm512_set1_epi64((long long)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_broadcastmw_epi32 (__mmask16 __A) -{ - return (__m512i) _mm512_set1_epi32((int) __A); - +_mm512_broadcastmw_epi32(__mmask16 __A) { + return (__m512i)_mm512_set1_epi32((int)__A); } #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h index fb65bf933b8ad..fef1a2d64d538 100644 --- a/clang/lib/Headers/avx512dqintrin.h +++ b/clang/lib/Headers/avx512dqintrin.h @@ -1083,17 +1083,15 @@ _mm512_broadcast_f32x2(__m128 __A) { 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x2(__A), (__v16sf)__O); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcast_f32x2(__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x2(__A), (__v16sf)_mm512_setzero_ps()); @@ -1106,17 +1104,15 @@ _mm512_broadcast_f32x8(__m256 __A) { 0, 1, 2, 3, 4, 5, 6, 7); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x8(__A), (__v16sf)__O); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x8(__A), (__v16sf)_mm512_setzero_ps()); @@ -1128,17 +1124,15 @@ _mm512_broadcast_f64x2(__m128d __A) { 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 
-_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, (__v8df)_mm512_broadcast_f64x2(__A), (__v8df)__O); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, (__v8df)_mm512_broadcast_f64x2(__A), (__v8df)_mm512_setzero_pd()); @@ -1151,17 +1145,15 @@ _mm512_broadcast_i32x2(__m128i __A) { 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x2(__A), (__v16si)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x2(__A), (__v16si)_mm512_setzero_si512()); @@ -1174,17 +1166,15 @@ _mm512_broadcast_i32x8(__m256i __A) { 0, 1, 2, 3, 4, 5, 6, 7); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x8(__A), (__v16si)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x8(__A), (__v16si)_mm512_setzero_si512()); @@ -1196,26 +1186,24 @@ _mm512_broadcast_i64x2(__m128i __A) { 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_broadcast_i64x2(__A), (__v8di)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, (__v8di)_mm512_broadcast_i64x2(__A), (__v8di)_mm512_setzero_si512()); } -#define _mm512_extractf32x8_ps(A, imm) \ - ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)_mm256_undefined_ps(), \ - (__mmask8)-1)) +#define _mm512_extractf32x8_ps(A, imm) \ + ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8) - 1)) #define _mm512_mask_extractf32x8_ps(W, U, A, imm) \ 
((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ @@ -1227,11 +1215,10 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) (__v8sf)_mm256_setzero_ps(), \ (__mmask8)(U))) -#define _mm512_extractf64x2_pd(A, imm) \ - ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)_mm_undefined_pd(), \ - (__mmask8)-1)) +#define _mm512_extractf64x2_pd(A, imm) \ + ((__m128d)__builtin_ia32_extractf64x2_512_mask( \ + (__v8df)(__m512d)(A), (int)(imm), (__v2df)_mm_setzero_pd(), \ + (__mmask8) - 1)) #define _mm512_mask_extractf64x2_pd(W, U, A, imm) \ ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ @@ -1245,10 +1232,10 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) (__v2df)_mm_setzero_pd(), \ (__mmask8)(U))) -#define _mm512_extracti32x8_epi32(A, imm) \ - ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)_mm256_undefined_si256(), \ - (__mmask8)-1)) +#define _mm512_extracti32x8_epi32(A, imm) \ + ((__m256i)__builtin_ia32_extracti32x8_mask( \ + (__v16si)(__m512i)(A), (int)(imm), (__v8si)_mm256_setzero_si256(), \ + (__mmask8) - 1)) #define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \ ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ @@ -1260,11 +1247,10 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) (__v8si)_mm256_setzero_si256(), \ (__mmask8)(U))) -#define _mm512_extracti64x2_epi64(A, imm) \ - ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ - (int)(imm), \ - (__v2di)_mm_undefined_si128(), \ - (__mmask8)-1)) +#define _mm512_extracti64x2_epi64(A, imm) \ + ((__m128i)__builtin_ia32_extracti64x2_512_mask( \ + (__v8di)(__m512i)(A), (int)(imm), (__v2di)_mm_setzero_si128(), \ + (__mmask8) - 1)) #define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \ ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 80e58425cdd71..18c4a44a4c76e 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -225,17 +225,15 @@ _mm512_broadcastd_epi32(__m128i __A) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512(__M, (__v16si) _mm512_broadcastd_epi32(__A), (__v16si) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcastd_epi32(__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512(__M, (__v16si) _mm512_broadcastd_epi32(__A), (__v16si) _mm512_setzero_si512()); @@ -247,18 +245,14 @@ _mm512_broadcastq_epi64(__m128i __A) { 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di) _mm512_broadcastq_epi64(__A), - (__v8di) __O); - +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, __m128i __A) { + return (__m512i)__builtin_ia32_selectq_512( + __M, (__v8di)_mm512_broadcastq_epi64(__A), (__v8di)__O); } -static __inline__ __m512i 
__DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512(__M, (__v8di) _mm512_broadcastq_epi64(__A), (__v8di) _mm512_setzero_si512()); @@ -321,9 +315,8 @@ _mm512_set1_epi32(int __s) __s, __s, __s, __s, __s, __s, __s, __s }; } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_set1_epi32(__mmask16 __M, int __A) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_set1_epi32(__mmask16 __M, int __A) { return (__m512i)__builtin_ia32_selectd_512(__M, (__v16si)_mm512_set1_epi32(__A), (__v16si)_mm512_setzero_si512()); @@ -335,9 +328,8 @@ _mm512_set1_epi64(long long __d) return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d }; } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) { return (__m512i)__builtin_ia32_selectq_512(__M, (__v8di)_mm512_set1_epi64(__A), (__v8di)_mm512_setzero_si512()); @@ -3164,10 +3156,10 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, (__v16si)_mm512_setzero_si512())) /* Vector Extract */ -#define _mm512_extractf64x4_pd(A, I) \ - ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ - (__v4df)_mm256_undefined_pd(), \ - (__mmask8)-1)) +#define _mm512_extractf64x4_pd(A, I) \ + ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8) - 1)) #define _mm512_mask_extractf64x4_pd(W, U, A, imm) \ ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ @@ -3179,10 +3171,10 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, (__v4df)_mm256_setzero_pd(), \ (__mmask8)(U))) -#define _mm512_extractf32x4_ps(A, I) \ - ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v4sf)_mm_undefined_ps(), \ - (__mmask8)-1)) +#define _mm512_extractf32x4_ps(A, I) \ + ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8) - 1)) #define _mm512_mask_extractf32x4_ps(W, U, A, imm) \ ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ @@ -6552,17 +6544,15 @@ _mm512_broadcast_f32x4(__m128 __A) { 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x4(__A), (__v16sf)__O); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x4(__A), (__v16sf)_mm512_setzero_ps()); @@ -6597,17 +6587,15 @@ _mm512_broadcast_i32x4(__m128i __A) { 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_broadcast_i32x4(__m512i __O, 
__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x4(__A), (__v16si)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x4(__A), (__v16si)_mm512_setzero_si512()); @@ -6635,33 +6623,29 @@ _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512(__M, (__v8df) _mm512_broadcastsd_pd(__A), (__v8df) __O); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512(__M, (__v8df) _mm512_broadcastsd_pd(__A), (__v8df) _mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512(__M, (__v16sf) _mm512_broadcastss_ps(__A), (__v16sf) __O); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512(__M, (__v16sf) _mm512_broadcastss_ps(__A), (__v16sf) _mm512_setzero_ps()); @@ -7105,10 +7089,10 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); } -#define _mm512_extracti32x4_epi32(A, imm) \ - ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ - (__mmask8)-1)) +#define _mm512_extracti32x4_epi32(A, imm) \ + ((__m128i)__builtin_ia32_extracti32x4_mask( \ + (__v16si)(__m512i)(A), (int)(imm), (__v4si)_mm_setzero_si128(), \ + (__mmask8) - 1)) #define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \ ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ @@ -7120,10 +7104,10 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) (__v4si)_mm_setzero_si128(), \ (__mmask8)(U))) -#define _mm512_extracti64x4_epi64(A, imm) \ +#define _mm512_extracti64x4_epi64(A, imm) \ ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)_mm256_undefined_si256(), \ - (__mmask8)-1)) + (__v4di)_mm256_setzero_si256(), \ + (__mmask8) - 1)) #define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \ ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ @@ -8381,17 +8365,15 @@ _mm512_movehdup_ps (__m512 __A) 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_movehdup_ps(__m512 __W, 
__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_movehdup_ps(__A), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_movehdup_ps(__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_movehdup_ps(__A), (__v16sf)_mm512_setzero_ps()); @@ -8404,44 +8386,38 @@ _mm512_moveldup_ps (__m512 __A) 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_moveldup_ps(__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_moveldup_ps(__A), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_moveldup_ps(__A), (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_move_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_move_ss(__mmask8 __U, __m128 __A, __m128 __B) { return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), _mm_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_move_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_move_sd(__mmask8 __U, __m128d __A, __m128d __B) { return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), _mm_setzero_pd()); } @@ -8884,17 +8860,15 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B) } #endif -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_set1_epi32(__m512i __O, __mmask16 __M, int __A) { return (__m512i) __builtin_ia32_selectd_512(__M, (__v16si) _mm512_set1_epi32(__A), (__v16si) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_set1_epi64(__m512i __O, __mmask8 __M, long long __A) { return (__m512i) __builtin_ia32_selectq_512(__M, (__v8di) _mm512_set1_epi64(__A), (__v8di) __O); diff --git a/clang/lib/Headers/avx512ifmaintrin.h b/clang/lib/Headers/avx512ifmaintrin.h index f01b322ce7787..625a8ff66dc60 100644 --- a/clang/lib/Headers/avx512ifmaintrin.h +++ b/clang/lib/Headers/avx512ifmaintrin.h @@ -15,54 
+15,53 @@ #define __IFMAINTRIN_H /* Define the default attributes for the functions in this file. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + constexpr \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ + __min_vector_width__(512))) +#else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ __min_vector_width__(512))) +#endif static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z) -{ - return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y, - (__v8di) __Z); +_mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) { + return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di)__X, (__v8di)__Y, + (__v8di)__Z); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y), - (__v8di)__W); +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_madd52hi_epu64( + __m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_selectq_512( + __M, (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z), - (__v8di)_mm512_setzero_si512()); +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_madd52hi_epu64( + __mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) { + return (__m512i)__builtin_ia32_selectq_512( + __M, (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z) -{ - return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y, - (__v8di) __Z); +_mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) { + return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di)__X, (__v8di)__Y, + (__v8di)__Z); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y), - (__v8di)__W); +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_madd52lo_epu64( + __m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_selectq_512( + __M, (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z), - (__v8di)_mm512_setzero_si512()); +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_madd52lo_epu64( + __mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) { + return (__m512i)__builtin_ia32_selectq_512( + __M, (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z), + (__v8di)_mm512_setzero_si512()); } #undef __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h index a72b56113a12b..c4449c7ece9ff 100644 --- a/clang/lib/Headers/avx512ifmavlintrin.h +++ b/clang/lib/Headers/avx512ifmavlintrin.h @@ -8,13 +8,24 @@ 
 *===-----------------------------------------------------------------------===
 */
 
 #ifndef __IMMINTRIN_H
-#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
+#error \
+    "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
 #endif
 
 #ifndef __IFMAVLINTRIN_H
 #define __IFMAVLINTRIN_H
 
 /* Define the default attributes for the functions in this file. */
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128                                                  \
+  constexpr __attribute__((__always_inline__, __nodebug__,                     \
+                           __target__("avx512ifma,avx512vl"),                  \
+                           __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  constexpr __attribute__((__always_inline__, __nodebug__,                     \
+                           __target__("avx512ifma,avx512vl"),                  \
+                           __min_vector_width__(256)))
+#else
 #define __DEFAULT_FN_ATTRS128                                                  \
   __attribute__((__always_inline__, __nodebug__,                               \
                  __target__("avx512ifma,avx512vl"),                            \
@@ -24,6 +35,8 @@
                  __target__("avx512ifma,avx512vl"),                            \
                  __min_vector_width__(256)))
 
+#endif
+
 #define _mm_madd52hi_epu64(X, Y, Z)                                            \
   ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y),            \
                                           (__v2di)(Z)))
@@ -41,70 +54,57 @@
                                           (__v4di)(Z)))
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__M,
-                                   (__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
-                                   (__v2di)__W);
+_mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectq_128(
+      __M, (__v2di)_mm_madd52hi_epu64(__W, __X, __Y), (__v2di)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__M,
-                                   (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
-                                   (__v2di)_mm_setzero_si128());
+_mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
+  return (__m128i)__builtin_ia32_selectq_128(
+      __M, (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
+      (__v2di)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__M,
-                                (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
-                                (__v4di)__W);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52hi_epu64(
+    __m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_selectq_256(
+      __M, (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y), (__v4di)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
-{
-  return (__m256i)__builtin_ia32_selectq_256(__M,
-                                (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
-                                (__v4di)_mm256_setzero_si256());
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52hi_epu64(
+    __mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) {
+  return (__m256i)__builtin_ia32_selectq_256(
+      __M, (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
+      (__v4di)_mm256_setzero_si256());
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
-{
-  return (__m128i)__builtin_ia32_selectq_128(__M,
-                                   (__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
-                                   (__v2di)__W);
+_mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_selectq_128(
+      __M, (__v2di)_mm_madd52lo_epu64(__W, __X, __Y), (__v2di)__W);
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X,
__m128i __Y, __m128i __Z) -{ - return (__m128i)__builtin_ia32_selectq_128(__M, - (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z), - (__v2di)_mm_setzero_si128()); +_mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { + return (__m128i)__builtin_ia32_selectq_128( + __M, (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z), + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256(__M, - (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y), - (__v4di)__W); +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52lo_epu64( + __m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_selectq_256( + __M, (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) -{ - return (__m256i)__builtin_ia32_selectq_256(__M, - (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z), - (__v4di)_mm256_setzero_si256()); +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52lo_epu64( + __mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { + return (__m256i)__builtin_ia32_selectq_256( + __M, (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z), + (__v4di)_mm256_setzero_si256()); } - #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 81e4cbb9615c1..0fcfe3779fa19 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -1067,33 +1067,29 @@ _mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { (__v16hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_shuffle_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_shuffle_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_shuffle_epi8(__A, __B), (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_shuffle_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); @@ -1514,28 +1510,28 @@ _mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) __builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static 
__inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mulhrs_epi16(__X, __Y), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mulhrs_epi16(__X, __Y), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_mulhrs_epi16(__X, __Y), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_mulhrs_epi16(__X, __Y), diff --git a/clang/lib/Headers/avx512vlcdintrin.h b/clang/lib/Headers/avx512vlcdintrin.h index 30c9f9017f0bf..7719680faf93a 100644 --- a/clang/lib/Headers/avx512vlcdintrin.h +++ b/clang/lib/Headers/avx512vlcdintrin.h @@ -14,208 +14,182 @@ #define __AVX512VLCDINTRIN_H /* Define the default attributes for the functions in this file. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128 \ + constexpr __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vl,avx512cd"), \ + __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + constexpr __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vl,avx512cd"), \ + __min_vector_width__(256))) +#else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vl,avx512cd"), __min_vector_width__(128))) #define __DEFAULT_FN_ATTRS256 \ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vl,avx512cd"), __min_vector_width__(256))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr -#else -#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 #endif static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcastmb_epi64 (__mmask8 __A) -{ +_mm_broadcastmb_epi64(__mmask8 __A) { return (__m128i) _mm_set1_epi64x((long long) __A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastmb_epi64 (__mmask8 __A) -{ - return (__m256i) _mm256_set1_epi64x((long long)__A); +_mm256_broadcastmb_epi64(__mmask8 __A) { + return (__m256i)_mm256_set1_epi64x((long long)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcastmw_epi32 (__mmask16 __A) -{ +_mm_broadcastmw_epi32(__mmask16 __A) { return (__m128i) _mm_set1_epi32((int)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastmw_epi32 (__mmask16 __A) -{ +_mm256_broadcastmw_epi32(__mmask16 __A) { return (__m256i) _mm256_set1_epi32((int)__A); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_conflict_epi64 (__m128i __A) -{ - return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A); +_mm_conflict_epi64(__m128i __A) { + return (__m128i)__builtin_ia32_vpconflictdi_128((__v2di)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 
-_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_conflict_epi64(__A), - (__v2di)__W); +_mm_mask_conflict_epi64(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectq_128( + (__mmask8)__U, (__v2di)_mm_conflict_epi64(__A), (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) -{ +_mm_maskz_conflict_epi64(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_conflict_epi64(__A), (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_conflict_epi64 (__m256i __A) -{ - return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A); +_mm256_conflict_epi64(__m256i __A) { + return (__m256i)__builtin_ia32_vpconflictdi_256((__v4di)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_conflict_epi64(__A), - (__v4di)__W); +_mm256_mask_conflict_epi64(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectq_256( + (__mmask8)__U, (__v4di)_mm256_conflict_epi64(__A), (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) -{ +_mm256_maskz_conflict_epi64(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_conflict_epi64(__A), (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_conflict_epi32 (__m128i __A) -{ - return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A); +_mm_conflict_epi32(__m128i __A) { + return (__m128i)__builtin_ia32_vpconflictsi_128((__v4si)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_conflict_epi32(__A), - (__v4si)__W); +_mm_mask_conflict_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_conflict_epi32(__A), (__v4si)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) -{ +_mm_maskz_conflict_epi32(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_conflict_epi32(__A), (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_conflict_epi32 (__m256i __A) -{ - return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A); +_mm256_conflict_epi32(__m256i __A) { + return (__m256i)__builtin_ia32_vpconflictsi_256((__v8si)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_conflict_epi32(__A), - (__v8si)__W); +_mm256_mask_conflict_epi32(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_conflict_epi32(__A), (__v8si)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) -{ +_mm256_maskz_conflict_epi32(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_conflict_epi32(__A), (__v8si)_mm256_setzero_si256()); } -static __inline__ 
__m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_lzcnt_epi32(__m128i __A) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_lzcnt_epi32(__m128i __A) { return (__m128i)__builtin_elementwise_clzg((__v4si)__A, (__v4si)_mm_set1_epi32(32)); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_lzcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_lzcnt_epi32(__A), - (__v4si)__W); + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_lzcnt_epi32(__A), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_lzcnt_epi32(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_lzcnt_epi32(__A), - (__v4si)_mm_setzero_si128()); + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_lzcnt_epi32(__A), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_lzcnt_epi32(__m256i __A) { return (__m256i)__builtin_elementwise_clzg((__v8si)__A, (__v8si)_mm256_set1_epi32(32)); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_lzcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_lzcnt_epi32(__A), - (__v8si)__W); + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_lzcnt_epi32(__A), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_lzcnt_epi32(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_lzcnt_epi32(__A), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_lzcnt_epi64(__m128i __A) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_lzcnt_epi64(__m128i __A) { return (__m128i)__builtin_elementwise_clzg( (__v2di)__A, (__v2di)_mm_set1_epi64x((long long)64)); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_lzcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_lzcnt_epi64(__A), - (__v2di)__W); + return (__m128i)__builtin_ia32_selectq_128( + (__mmask8)__U, (__v2di)_mm_lzcnt_epi64(__A), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_lzcnt_epi64(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_lzcnt_epi64(__A), - (__v2di)_mm_setzero_si128()); + return (__m128i)__builtin_ia32_selectq_128( + (__mmask8)__U, (__v2di)_mm_lzcnt_epi64(__A), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_lzcnt_epi64(__m256i __A) { return (__m256i)__builtin_elementwise_clzg( (__v4di)__A, (__v4di)_mm256_set1_epi64x((long long)64)); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_lzcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_lzcnt_epi64(__A), - (__v4di)__W); + 
return (__m256i)__builtin_ia32_selectq_256( + (__mmask8)__U, (__v4di)_mm256_lzcnt_epi64(__A), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_lzcnt_epi64(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_lzcnt_epi64(__A), @@ -224,7 +198,5 @@ _mm256_maskz_lzcnt_epi64(__mmask8 __U, __m256i __A) { #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 -#undef __DEFAULT_FN_ATTRS128_CONSTEXPR -#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif /* __AVX512VLCDINTRIN_H */ diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h index 68bd52e43981a..707d039cf4c07 100644 --- a/clang/lib/Headers/avx512vldqintrin.h +++ b/clang/lib/Headers/avx512vldqintrin.h @@ -968,17 +968,15 @@ _mm256_broadcast_f32x2(__m128 __A) { 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, (__v8sf)_mm256_broadcast_f32x2(__A), (__v8sf)__O); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, (__v8sf)_mm256_broadcast_f32x2(__A), (__v8sf)_mm256_setzero_ps()); @@ -990,17 +988,15 @@ _mm256_broadcast_f64x2(__m128d __A) { 0, 1, 0, 1); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, (__v4df)_mm256_broadcast_f64x2(__A), (__v4df)__O); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, (__v4df)_mm256_broadcast_f64x2(__A), (__v4df)_mm256_setzero_pd()); @@ -1012,17 +1008,15 @@ _mm_broadcast_i32x2(__m128i __A) { 0, 1, 0, 1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, (__v4si)_mm_broadcast_i32x2(__A), (__v4si)__O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, (__v4si)_mm_broadcast_i32x2(__A), (__v4si)_mm_setzero_si128()); @@ -1034,17 +1028,15 @@ _mm256_broadcast_i32x2(__m128i __A) { 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A) { return 
(__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_broadcast_i32x2(__A), (__v8si)__O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_broadcast_i32x2(__A), (__v8si)_mm256_setzero_si256()); @@ -1056,27 +1048,24 @@ _mm256_broadcast_i64x2(__m128i __A) { 0, 1, 0, 1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_broadcast_i64x2(__A), (__v4di)__O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, (__v4di)_mm256_broadcast_i64x2(__A), (__v4di)_mm256_setzero_si256()); } -#define _mm256_extractf64x2_pd(A, imm) \ - ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)_mm_undefined_pd(), \ - (__mmask8)-1)) +#define _mm256_extractf64x2_pd(A, imm) \ + ((__m128d)__builtin_ia32_extractf64x2_256_mask( \ + (__v4df)(__m256d)(A), (int)(imm), (__v2df)_mm_setzero_pd(), \ + (__mmask8) - 1)) #define _mm256_mask_extractf64x2_pd(W, U, A, imm) \ ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ @@ -1090,11 +1079,10 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) (__v2df)_mm_setzero_pd(), \ (__mmask8)(U))) -#define _mm256_extracti64x2_epi64(A, imm) \ - ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ - (int)(imm), \ - (__v2di)_mm_undefined_si128(), \ - (__mmask8)-1)) +#define _mm256_extracti64x2_epi64(A, imm) \ + ((__m128i)__builtin_ia32_extracti64x2_256_mask( \ + (__v4di)(__m256i)(A), (int)(imm), (__v2di)_mm_setzero_si128(), \ + (__mmask8) - 1)) #define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \ ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 965741f0ff944..92bb444aeb5b8 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -5101,69 +5101,55 @@ _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) -{ - return (__m128i)__builtin_ia32_selectd_128(__M, - (__v4si) _mm_set1_epi32(__A), - (__v4si)__O); +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) { + return (__m128i)__builtin_ia32_selectd_128(__M, (__v4si)_mm_set1_epi32(__A), + (__v4si)__O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_set1_epi32( __mmask8 __M, int __A) -{ - return (__m128i)__builtin_ia32_selectd_128(__M, - (__v4si) _mm_set1_epi32(__A), - (__v4si)_mm_setzero_si128()); +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_set1_epi32(__mmask8 __M, int __A) { + return (__m128i)__builtin_ia32_selectd_128(__M, (__v4si)_mm_set1_epi32(__A), + (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i 
__DEFAULT_FN_ATTRS256 -_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) -{ - return (__m256i)__builtin_ia32_selectd_256(__M, - (__v8si) _mm256_set1_epi32(__A), - (__v8si)__O); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) { + return (__m256i)__builtin_ia32_selectd_256( + __M, (__v8si)_mm256_set1_epi32(__A), (__v8si)__O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_set1_epi32( __mmask8 __M, int __A) -{ - return (__m256i)__builtin_ia32_selectd_256(__M, - (__v8si) _mm256_set1_epi32(__A), - (__v8si)_mm256_setzero_si256()); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_set1_epi32(__mmask8 __M, int __A) { + return (__m256i)__builtin_ia32_selectd_256( + __M, (__v8si)_mm256_set1_epi32(__A), (__v8si)_mm256_setzero_si256()); } - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, (__v2di) _mm_set1_epi64x(__A), (__v2di) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_set1_epi64 (__mmask8 __M, long long __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_set1_epi64(__mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, (__v2di) _mm_set1_epi64x(__A), (__v2di) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) { return (__m256i) __builtin_ia32_selectq_256(__M, (__v4di) _mm256_set1_epi64x(__A), (__v4di) __O) ; } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) -{ - return (__m256i) __builtin_ia32_selectq_256(__M, - (__v4di) _mm256_set1_epi64x(__A), - (__v4di) _mm256_setzero_si256()); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) { + return (__m256i)__builtin_ia32_selectq_256( + __M, (__v4di)_mm256_set1_epi64x(__A), (__v4di)_mm256_setzero_si256()); } #define _mm_fixupimm_pd(A, B, C, imm) \ @@ -5610,130 +5596,113 @@ _mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A) (__mmask8) __U); } - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_unpackhi_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_unpackhi_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, 
(__v4df)_mm256_unpackhi_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_unpackhi_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_unpackhi_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_unpackhi_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_unpackhi_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_unpackhi_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_unpacklo_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_unpacklo_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_unpacklo_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_unpacklo_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ +static __inline__ __m128 
__DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_unpacklo_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_unpacklo_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_unpacklo_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_unpacklo_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); @@ -6055,129 +6024,117 @@ _mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_unpackhi_epi32(__A, __B), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_unpackhi_epi32(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_unpackhi_epi32(__A, __B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_unpackhi_epi32(__A, __B), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_unpackhi_epi64(__A, __B), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_unpackhi_epi64(__mmask8 
__U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_unpackhi_epi64(__A, __B), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_unpackhi_epi64(__A, __B), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_unpackhi_epi64(__A, __B), (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_unpacklo_epi32(__A, __B), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_unpacklo_epi32(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_unpacklo_epi32(__A, __B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_unpacklo_epi32(__A, __B), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_unpacklo_epi64(__A, __B), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_unpacklo_epi64(__A, __B), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return 
(__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_unpacklo_epi64(__A, __B), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_unpacklo_epi64(__A, __B), (__v4di)_mm256_setzero_si256()); @@ -6594,17 +6551,15 @@ _mm256_broadcast_f32x4(__m128 __A) { 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, (__v8sf)_mm256_broadcast_f32x4(__A), (__v8sf)__O); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, (__v8sf)_mm256_broadcast_f32x4(__A), (__v8sf)_mm256_setzero_ps()); @@ -6616,129 +6571,113 @@ _mm256_broadcast_i32x4(__m128i __A) { 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_broadcast_i32x4(__A), (__v8si)__O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_broadcast_i32x4(__A), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256(__M, (__v4df) _mm256_broadcastsd_pd(__A), (__v4df) __O); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) -{ +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256(__M, (__v4df) _mm256_broadcastsd_pd(__A), (__v4df) _mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_broadcastss_ps(__m128 __O, __mmask8 __M, __m128 __A) { return (__m128)__builtin_ia32_selectps_128(__M, (__v4sf) _mm_broadcastss_ps(__A), (__v4sf) __O); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) { return (__m128)__builtin_ia32_selectps_128(__M, (__v4sf) _mm_broadcastss_ps(__A), (__v4sf) _mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 
__M, __m128 __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_broadcastss_ps(__m256 __O, __mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256(__M, (__v8sf) _mm256_broadcastss_ps(__A), (__v8sf) __O); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256(__M, (__v8sf) _mm256_broadcastss_ps(__A), (__v8sf) _mm256_setzero_ps()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_broadcastd_epi32(__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128(__M, (__v4si) _mm_broadcastd_epi32(__A), (__v4si) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128(__M, (__v4si) _mm_broadcastd_epi32(__A), (__v4si) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_broadcastd_epi32(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256(__M, (__v8si) _mm256_broadcastd_epi32(__A), (__v8si) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256(__M, (__v8si) _mm256_broadcastd_epi32(__A), (__v8si) _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_broadcastq_epi64(__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128(__M, (__v2di) _mm_broadcastq_epi64(__A), (__v2di) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128(__M, (__v2di) _mm_broadcastq_epi64(__A), (__v2di) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_broadcastq_epi64(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256(__M, (__v4di) _mm256_broadcastq_epi64(__A), (__v4di) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256(__M, (__v4di) _mm256_broadcastq_epi64(__A), (__v4di) _mm256_setzero_si256()); @@ -7606,11 +7545,10 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, 
__M); } -#define _mm256_extractf32x4_ps(A, imm) \ - ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)_mm_undefined_ps(), \ - (__mmask8)-1)) +#define _mm256_extractf32x4_ps(A, imm) \ + ((__m128)__builtin_ia32_extractf32x4_256_mask( \ + (__v8sf)(__m256)(A), (int)(imm), (__v4sf)_mm_setzero_ps(), \ + (__mmask8) - 1)) #define _mm256_mask_extractf32x4_ps(W, U, A, imm) \ ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ @@ -7624,11 +7562,10 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U))) -#define _mm256_extracti32x4_epi32(A, imm) \ - ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ - (__mmask8)-1)) +#define _mm256_extracti32x4_epi32(A, imm) \ + ((__m128i)__builtin_ia32_extracti32x4_256_mask( \ + (__v8si)(__m256i)(A), (int)(imm), (__v4si)_mm_setzero_si128(), \ + (__mmask8) - 1)) #define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \ ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ @@ -8003,65 +7940,57 @@ _mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ (__v4di)_mm256_setzero_si256())) -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_movehdup_ps(__A), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_movehdup_ps(__A), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_movehdup_ps(__A), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_movehdup_ps(__A), (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_moveldup_ps(__A), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_moveldup_ps(__A), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) { return 
(__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_moveldup_ps(__A), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) -{ +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_moveldup_ps(__A), (__v8sf)_mm256_setzero_ps()); diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h index 5c782d2a5b865..a2ef601913431 100644 --- a/clang/lib/Headers/avxifmaintrin.h +++ b/clang/lib/Headers/avxifmaintrin.h @@ -15,12 +15,21 @@ #define __AVXIFMAINTRIN_H /* Define the default attributes for the functions in this file. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128 \ + constexpr __attribute__((__always_inline__, __nodebug__, \ + __target__("avxifma"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + constexpr __attribute__((__always_inline__, __nodebug__, \ + __target__("avxifma"), __min_vector_width__(256))) +#else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ __min_vector_width__(128))) #define __DEFAULT_FN_ATTRS256 \ __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ __min_vector_width__(256))) +#endif // must vex-encoding diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h index 123fa7933c4f8..4aef9245323fb 100644 --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -694,9 +694,8 @@ _mm256_xor_ps(__m256 __a, __m256 __b) /// elements of a vector of [4 x double]. /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of /// both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_hadd_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_hadd_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); } @@ -717,9 +716,8 @@ _mm256_hadd_pd(__m256d __a, __m256d __b) /// index 2, 3, 6, 7 of a vector of [8 x float]. /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of /// both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_hadd_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hadd_ps(__m256 __a, + __m256 __b) { return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); } @@ -740,9 +738,8 @@ _mm256_hadd_ps(__m256 __a, __m256 __b) /// odd-indexed elements of a vector of [4 x double]. /// \returns A 256-bit vector of [4 x double] containing the horizontal /// differences of both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_hsub_pd(__m256d __a, __m256d __b) -{ +static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_hsub_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); } @@ -763,9 +760,8 @@ _mm256_hsub_pd(__m256d __a, __m256d __b) /// elements with index 2, 3, 6, 7 of a vector of [8 x float]. /// \returns A 256-bit vector of [8 x float] containing the horizontal /// differences of both operands. 
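/* Illustrative sketch: with __DEFAULT_FN_ATTRS_CONSTEXPR applied, these
   horizontal operations can participate in C++11 constant expressions,
   assuming _mm256_setr_pd is likewise constexpr in this build of the headers:

     constexpr __m256d __s =
         _mm256_hadd_pd(_mm256_setr_pd(1.0, 2.0, 3.0, 4.0),
                        _mm256_setr_pd(10.0, 20.0, 30.0, 40.0));
     // Within each 128-bit half the result interleaves the two sources:
     // {1+2, 10+20, 3+4, 30+40} == {3, 30, 7, 70}.
     static_assert(__s[0] == 3.0 && __s[1] == 30.0, "hadd lane order");
*/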
-static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_hsub_ps(__m256 __a, __m256 __b) -{ +static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_hsub_ps(__m256 __a, + __m256 __b) { return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); } @@ -2945,9 +2941,8 @@ _mm256_testnzc_si256(__m256i __a, __m256i __b) { /// A 256-bit vector of [4 x double] containing the double-precision /// floating point values with sign bits to be extracted. /// \returns The sign bits from the operand, written to bits [3:0]. -static __inline int __DEFAULT_FN_ATTRS -_mm256_movemask_pd(__m256d __a) -{ +static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_movemask_pd(__m256d __a) { return __builtin_ia32_movmskpd256((__v4df)__a); } @@ -2963,9 +2958,8 @@ _mm256_movemask_pd(__m256d __a) /// A 256-bit vector of [8 x float] containing the single-precision floating /// point values with sign bits to be extracted. /// \returns The sign bits from the operand, written to bits [7:0]. -static __inline int __DEFAULT_FN_ATTRS -_mm256_movemask_ps(__m256 __a) -{ +static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_movemask_ps(__m256 __a) { return __builtin_ia32_movmskps256((__v8sf)__a); } diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index 454e9a2504949..dbe5ca0379cf5 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -4280,7 +4280,8 @@ _mm_packus_epi16(__m128i __a, __m128i __b) { /// A 128-bit integer vector containing the values with bits to be extracted. /// \returns The most significant bits from each 8-bit element in \a __a, /// written to bits [15:0]. The other bits are assigned zeros. -static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) { +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_movemask_epi8(__m128i __a) { return __builtin_ia32_pmovmskb128((__v16qi)__a); } @@ -4699,7 +4700,8 @@ _mm_unpacklo_pd(__m128d __a, __m128d __b) { /// be extracted. /// \returns The sign bits from each of the double-precision elements in \a __a, /// written to bits [1:0]. The remaining bits are assigned values of zero. -static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) { +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_movemask_pd(__m128d __a) { return __builtin_ia32_movmskpd((__v2df)__a); } diff --git a/clang/lib/Headers/pmmintrin.h b/clang/lib/Headers/pmmintrin.h index f0c9b2ba38b0c..42bd343e326de 100644 --- a/clang/lib/Headers/pmmintrin.h +++ b/clang/lib/Headers/pmmintrin.h @@ -83,9 +83,8 @@ _mm_addsub_ps(__m128 __a, __m128 __b) /// destination. /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of /// both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_hadd_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_ps(__m128 __a, + __m128 __b) { return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); } @@ -106,9 +105,8 @@ _mm_hadd_ps(__m128 __a, __m128 __b) /// bits of the destination. /// \returns A 128-bit vector of [4 x float] containing the horizontal /// differences of both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_hsub_ps(__m128 __a, __m128 __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_ps(__m128 __a, + __m128 __b) { return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); } @@ -168,9 +166,8 @@ _mm_moveldup_ps(__m128 __a) /// A 128-bit vector of [2 x double] containing the right source operand. 
/// \returns A 128-bit vector of [2 x double] containing the alternating sums /// and differences of both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_addsub_pd(__m128d __a, __m128d __b) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_addsub_pd(__m128d __a, __m128d __b) { return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); } @@ -191,9 +188,8 @@ _mm_addsub_pd(__m128d __a, __m128d __b) /// destination. /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of /// both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_hadd_pd(__m128d __a, __m128d __b) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadd_pd(__m128d __a, __m128d __b) { return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); } @@ -214,9 +210,8 @@ _mm_hadd_pd(__m128d __a, __m128d __b) /// the destination. /// \returns A 128-bit vector of [2 x double] containing the horizontal /// differences of both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_hsub_pd(__m128d __a, __m128d __b) -{ +static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hsub_pd(__m128d __a, __m128d __b) { return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); } diff --git a/clang/lib/Headers/ptrauth.h b/clang/lib/Headers/ptrauth.h index f902ca1e3bbd3..ad28f06f0930c 100644 --- a/clang/lib/Headers/ptrauth.h +++ b/clang/lib/Headers/ptrauth.h @@ -241,6 +241,18 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t; #define ptrauth_type_discriminator(__type) \ __builtin_ptrauth_type_discriminator(__type) +/* Compute the constant discriminator used by Clang to sign pointers with the + given C function pointer type. + + A call to this function is an integer constant expression. */ +#if __has_feature(ptrauth_function_pointer_type_discrimination) +#define ptrauth_function_pointer_type_discriminator(__type) \ + __builtin_ptrauth_type_discriminator(__type) +#else +#define ptrauth_function_pointer_type_discriminator(__type) \ + ((ptrauth_extra_data_t)0) +#endif + /* Compute a signature for the given pair of pointer-sized values. The order of the arguments is significant. @@ -372,6 +384,8 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t; }) #define ptrauth_type_discriminator(__type) ((ptrauth_extra_data_t)0) +#define ptrauth_function_pointer_type_discriminator(__type) \ + ((ptrauth_extra_data_t)0) #define ptrauth_sign_generic_data(__value, __data) \ ({ \ diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h index 4f197d5ecaff9..511a135375295 100644 --- a/clang/lib/Headers/smmintrin.h +++ b/clang/lib/Headers/smmintrin.h @@ -1524,7 +1524,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2) { /// \returns A 128-bit value where bits [15:0] contain the minimum value found /// in parameter \a __V, bits [18:16] contain the index of the minimum value /// and the remaining bits are set to 0. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_minpos_epu16(__m128i __V) { return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V); } diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index 3fc9f9834baa9..5d0f20f4d527d 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -202,10 +202,9 @@ _mm_abs_epi32(__m128i __a) { /// destination. /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of /// both operands. 
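/* Illustrative sketch of the lane ordering for the [8 x i16] form: with
   __a = {a0..a7} and __b = {b0..b7},

     _mm_hadd_epi16(__a, __b) ==
         {a0+a1, a2+a3, a4+a5, a6+a7, b0+b1, b2+b3, b4+b5, b6+b7}

   so _mm_hadd_epi16(_mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8),
                     _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80))
   yields {3, 7, 11, 15, 30, 70, 110, 150}. */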
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hadd_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadd_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -225,10 +224,9 @@ _mm_hadd_epi16(__m128i __a, __m128i __b) /// destination. /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of /// both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hadd_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadd_epi32(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -248,11 +246,10 @@ _mm_hadd_epi32(__m128i __a, __m128i __b) /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_hadd_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phaddw128( - (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi16(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phaddw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -272,11 +269,10 @@ _mm_hadd_pi16(__m64 __a, __m64 __b) /// destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_hadd_pi32(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phaddd128( - (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadd_pi32(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phaddd128( + (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); } /// Horizontally adds, with saturation, the adjacent pairs of values contained @@ -299,10 +295,9 @@ _mm_hadd_pi32(__m64 __a, __m64 __b) /// destination. /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated /// sums of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hadds_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hadds_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b); } /// Horizontally adds, with saturation, the adjacent pairs of values contained @@ -325,11 +320,10 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// sums of both operands. 
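/* Worked example of the saturation: each pair sum is clamped to the signed
   16-bit range [-32768, 32767], so the pair {30000, 30000} produces 32767
   rather than wrapping to -5536, and {-30000, -30000} produces -32768. */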
-static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_hadds_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phaddsw128( - (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hadds_pi16(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phaddsw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -349,10 +343,9 @@ _mm_hadds_pi16(__m64 __a, __m64 __b) /// the destination. /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences /// of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsub_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hsub_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -372,10 +365,9 @@ _mm_hsub_epi16(__m128i __a, __m128i __b) /// the destination. /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences /// of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsub_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hsub_epi32(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -395,11 +387,10 @@ _mm_hsub_epi32(__m128i __a, __m128i __b) /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_hsub_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phsubw128( - (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi16(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phsubw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -419,11 +410,10 @@ _mm_hsub_pi16(__m64 __a, __m64 __b) /// the destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_hsub_pi32(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phsubd128( - (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsub_pi32(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phsubd128( + (__v4si)__builtin_shufflevector(__a, __b, 0, 1), (__v4si){})); } /// Horizontally subtracts, with saturation, the adjacent pairs of values @@ -446,10 +436,9 @@ _mm_hsub_pi32(__m64 __a, __m64 __b) /// the destination. /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated /// differences of both operands. 
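/* Worked example: each output lane is the less-significant element of a pair
   minus the more-significant one, saturated, so the pair {-30000, 30000}
   gives -30000 - 30000 = -60000, clamped to -32768 instead of wrapping. */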
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsubs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_hsubs_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b); } /// Horizontally subtracts, with saturation, the adjacent pairs of values @@ -472,11 +461,10 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// differences of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_hsubs_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_phsubsw128( - (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_hsubs_pi16(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_phsubsw128( + (__v8hi)__builtin_shufflevector(__a, __b, 0, 1), (__v8hi){})); } /// Multiplies corresponding pairs of packed 8-bit unsigned integer @@ -556,10 +544,9 @@ _mm_maddubs_pi16(__m64 __a, __m64 __b) { /// A 128-bit vector of [8 x i16] containing one of the source operands. /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled /// products of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mulhrs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mulhrs_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); } /// Multiplies packed 16-bit signed integer values, truncates the 32-bit @@ -576,11 +563,10 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b) /// A 64-bit vector of [4 x i16] containing one of the source operands. /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled /// products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_mulhrs_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a), - (__v8hi)__anyext128(__b))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mulhrs_pi16(__m64 __a, __m64 __b) { + return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__zext128(__a), + (__v8hi)__zext128(__b))); } /// Copies the 8-bit integers from a 128-bit integer vector to the @@ -603,10 +589,9 @@ _mm_mulhrs_pi16(__m64 __a, __m64 __b) /// Bits [6:4] Reserved. \n /// Bits [3:0] select the source byte to be copied. /// \returns A 128-bit integer vector containing the copied or cleared values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_shuffle_epi8(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_shuffle_epi8(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b); } /// Copies the 8-bit integers from a 64-bit integer vector to the @@ -628,13 +613,12 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b) /// destination. \n /// Bits [2:0] select the source byte to be copied. /// \returns A 64-bit integer vector containing the copied or cleared values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_shuffle_pi8(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_pshufb128( - (__v16qi)__builtin_shufflevector( - (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1), - (__v16qi)__anyext128(__b))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_shuffle_pi8(__m64 __a, __m64 __b) { + return __trunc64(__builtin_ia32_pshufb128( + (__v16qi)__builtin_shufflevector((__v2si)(__a), __extension__(__v2si){}, + 0, 1, 0, 1), + (__v16qi)__zext128(__b))); } /// For each 8-bit integer in the first source operand, perform one of diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h index 605409c1f43b9..fe6afdcfc3fdb 100644 --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -3014,9 +3014,7 @@ _mm_cvtps_pi8(__m128 __a) /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each /// single-precision floating-point element of the parameter. Bits [31:4] are /// set to zero. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_movemask_ps(__m128 __a) -{ +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movemask_ps(__m128 __a) { return __builtin_ia32_movmskps((__v4sf)__a); } diff --git a/clang/lib/Index/IndexTypeSourceInfo.cpp b/clang/lib/Index/IndexTypeSourceInfo.cpp index 74c6c116b274e..3c1e038e0c173 100644 --- a/clang/lib/Index/IndexTypeSourceInfo.cpp +++ b/clang/lib/Index/IndexTypeSourceInfo.cpp @@ -117,7 +117,7 @@ class TypeIndexer : public RecursiveASTVisitor { } bool VisitTagTypeLoc(TagTypeLoc TL) { - TagDecl *D = TL.getOriginalDecl(); + TagDecl *D = TL.getDecl(); if (!IndexCtx.shouldIndexFunctionLocalSymbols() && D->getParentFunctionOrMethod()) return true; diff --git a/clang/lib/Index/USRGeneration.cpp b/clang/lib/Index/USRGeneration.cpp index c78d66f9502dd..08835ea786997 100644 --- a/clang/lib/Index/USRGeneration.cpp +++ b/clang/lib/Index/USRGeneration.cpp @@ -911,11 +911,10 @@ void USRGenerator::VisitType(QualType T) { } if (const TagType *TT = T->getAs()) { if (const auto *ICNT = dyn_cast(TT)) { - T = ICNT->getOriginalDecl()->getCanonicalTemplateSpecializationType( - Ctx); + T = ICNT->getDecl()->getCanonicalTemplateSpecializationType(Ctx); } else { Out << '$'; - VisitTagDecl(TT->getOriginalDecl()); + VisitTagDecl(TT->getDecl()); return; } } diff --git a/clang/lib/InstallAPI/HeaderFile.cpp b/clang/lib/InstallAPI/HeaderFile.cpp index 0b7041ec8147e..d736a0af0dd47 100644 --- a/clang/lib/InstallAPI/HeaderFile.cpp +++ b/clang/lib/InstallAPI/HeaderFile.cpp @@ -38,7 +38,7 @@ std::optional createIncludeHeaderName(const StringRef FullPath) { bool isHeaderFile(StringRef Path) { return StringSwitch(sys::path::extension(Path)) - .Cases(".h", ".H", ".hh", ".hpp", ".hxx", true) + .Cases({".h", ".H", ".hh", ".hpp", ".hxx"}, true) .Default(false); } diff --git a/clang/lib/InstallAPI/Visitor.cpp b/clang/lib/InstallAPI/Visitor.cpp index f12e04069817b..53fbc36ae7600 100644 --- a/clang/lib/InstallAPI/Visitor.cpp +++ b/clang/lib/InstallAPI/Visitor.cpp @@ -543,8 +543,8 @@ void InstallAPIVisitor::emitVTableSymbols(const CXXRecordDecl *D, } for (const auto &It : D->bases()) { - const CXXRecordDecl *Base = cast( - It.getType()->castAs()->getOriginalDecl()); + const auto *Base = + cast(It.getType()->castAs()->getDecl()); const auto BaseAccess = getAccessForDecl(Base); if (!BaseAccess) continue; diff --git a/clang/lib/Interpreter/InterpreterValuePrinter.cpp b/clang/lib/Interpreter/InterpreterValuePrinter.cpp index a55b7f5f1a5fc..0ed02f3bfabe8 100644 --- 
a/clang/lib/Interpreter/InterpreterValuePrinter.cpp +++ b/clang/lib/Interpreter/InterpreterValuePrinter.cpp @@ -66,10 +66,10 @@ static std::string QualTypeToString(ASTContext &Ctx, QualType QT) { const QualType NonRefTy = QT.getNonReferenceType(); if (const auto *TTy = llvm::dyn_cast(NonRefTy)) - return DeclTypeToString(NonRefTy, TTy->getOriginalDecl()); + return DeclTypeToString(NonRefTy, TTy->getDecl()); if (const auto *TRy = dyn_cast(NonRefTy)) - return DeclTypeToString(NonRefTy, TRy->getOriginalDecl()); + return DeclTypeToString(NonRefTy, TRy->getDecl()); const QualType Canon = NonRefTy.getCanonicalType(); diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 5c6ecdbc304d6..6a5e5d4bad3a6 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -248,50 +248,67 @@ static bool warnByDefaultOnWrongCase(StringRef Include) { // The standard C/C++ and Posix headers return llvm::StringSwitch(LowerInclude) - // C library headers - .Cases("assert.h", "complex.h", "ctype.h", "errno.h", "fenv.h", true) - .Cases("float.h", "inttypes.h", "iso646.h", "limits.h", "locale.h", true) - .Cases("math.h", "setjmp.h", "signal.h", "stdalign.h", "stdarg.h", true) - .Cases("stdatomic.h", "stdbool.h", "stdckdint.h", "stdcountof.h", true) - .Cases("stddef.h", "stdint.h", "stdio.h", "stdlib.h", "stdnoreturn.h", true) - .Cases("string.h", "tgmath.h", "threads.h", "time.h", "uchar.h", true) - .Cases("wchar.h", "wctype.h", true) - - // C++ headers for C library facilities - .Cases("cassert", "ccomplex", "cctype", "cerrno", "cfenv", true) - .Cases("cfloat", "cinttypes", "ciso646", "climits", "clocale", true) - .Cases("cmath", "csetjmp", "csignal", "cstdalign", "cstdarg", true) - .Cases("cstdbool", "cstddef", "cstdint", "cstdio", "cstdlib", true) - .Cases("cstring", "ctgmath", "ctime", "cuchar", "cwchar", true) - .Case("cwctype", true) - - // C++ library headers - .Cases("algorithm", "fstream", "list", "regex", "thread", true) - .Cases("array", "functional", "locale", "scoped_allocator", "tuple", true) - .Cases("atomic", "future", "map", "set", "type_traits", true) - .Cases("bitset", "initializer_list", "memory", "shared_mutex", "typeindex", true) - .Cases("chrono", "iomanip", "mutex", "sstream", "typeinfo", true) - .Cases("codecvt", "ios", "new", "stack", "unordered_map", true) - .Cases("complex", "iosfwd", "numeric", "stdexcept", "unordered_set", true) - .Cases("condition_variable", "iostream", "ostream", "streambuf", "utility", true) - .Cases("deque", "istream", "queue", "string", "valarray", true) - .Cases("exception", "iterator", "random", "strstream", "vector", true) - .Cases("forward_list", "limits", "ratio", "system_error", true) - - // POSIX headers (which aren't also C headers) - .Cases("aio.h", "arpa/inet.h", "cpio.h", "dirent.h", "dlfcn.h", true) - .Cases("fcntl.h", "fmtmsg.h", "fnmatch.h", "ftw.h", "glob.h", true) - .Cases("grp.h", "iconv.h", "langinfo.h", "libgen.h", "monetary.h", true) - .Cases("mqueue.h", "ndbm.h", "net/if.h", "netdb.h", "netinet/in.h", true) - .Cases("netinet/tcp.h", "nl_types.h", "poll.h", "pthread.h", "pwd.h", true) - .Cases("regex.h", "sched.h", "search.h", "semaphore.h", "spawn.h", true) - .Cases("strings.h", "stropts.h", "sys/ipc.h", "sys/mman.h", "sys/msg.h", true) - .Cases("sys/resource.h", "sys/select.h", "sys/sem.h", "sys/shm.h", "sys/socket.h", true) - .Cases("sys/stat.h", "sys/statvfs.h", "sys/time.h", "sys/times.h", "sys/types.h", true) - .Cases("sys/uio.h", "sys/un.h", "sys/utsname.h", "sys/wait.h", "syslog.h", true) 
- .Cases("tar.h", "termios.h", "trace.h", "ulimit.h", true) - .Cases("unistd.h", "utime.h", "utmpx.h", "wordexp.h", true) - .Default(false); + // C library headers + .Cases({"assert.h", "complex.h", "ctype.h", "errno.h", "fenv.h"}, true) + .Cases({"float.h", "inttypes.h", "iso646.h", "limits.h", "locale.h"}, + true) + .Cases({"math.h", "setjmp.h", "signal.h", "stdalign.h", "stdarg.h"}, true) + .Cases({"stdatomic.h", "stdbool.h", "stdckdint.h", "stdcountof.h"}, true) + .Cases({"stddef.h", "stdint.h", "stdio.h", "stdlib.h", "stdnoreturn.h"}, + true) + .Cases({"string.h", "tgmath.h", "threads.h", "time.h", "uchar.h"}, true) + .Cases({"wchar.h", "wctype.h"}, true) + + // C++ headers for C library facilities + .Cases({"cassert", "ccomplex", "cctype", "cerrno", "cfenv"}, true) + .Cases({"cfloat", "cinttypes", "ciso646", "climits", "clocale"}, true) + .Cases({"cmath", "csetjmp", "csignal", "cstdalign", "cstdarg"}, true) + .Cases({"cstdbool", "cstddef", "cstdint", "cstdio", "cstdlib"}, true) + .Cases({"cstring", "ctgmath", "ctime", "cuchar", "cwchar"}, true) + .Case("cwctype", true) + + // C++ library headers + .Cases({"algorithm", "fstream", "list", "regex", "thread"}, true) + .Cases({"array", "functional", "locale", "scoped_allocator", "tuple"}, + true) + .Cases({"atomic", "future", "map", "set", "type_traits"}, true) + .Cases( + {"bitset", "initializer_list", "memory", "shared_mutex", "typeindex"}, + true) + .Cases({"chrono", "iomanip", "mutex", "sstream", "typeinfo"}, true) + .Cases({"codecvt", "ios", "new", "stack", "unordered_map"}, true) + .Cases({"complex", "iosfwd", "numeric", "stdexcept", "unordered_set"}, + true) + .Cases( + {"condition_variable", "iostream", "ostream", "streambuf", "utility"}, + true) + .Cases({"deque", "istream", "queue", "string", "valarray"}, true) + .Cases({"exception", "iterator", "random", "strstream", "vector"}, true) + .Cases({"forward_list", "limits", "ratio", "system_error"}, true) + + // POSIX headers (which aren't also C headers) + .Cases({"aio.h", "arpa/inet.h", "cpio.h", "dirent.h", "dlfcn.h"}, true) + .Cases({"fcntl.h", "fmtmsg.h", "fnmatch.h", "ftw.h", "glob.h"}, true) + .Cases({"grp.h", "iconv.h", "langinfo.h", "libgen.h", "monetary.h"}, true) + .Cases({"mqueue.h", "ndbm.h", "net/if.h", "netdb.h", "netinet/in.h"}, + true) + .Cases({"netinet/tcp.h", "nl_types.h", "poll.h", "pthread.h", "pwd.h"}, + true) + .Cases({"regex.h", "sched.h", "search.h", "semaphore.h", "spawn.h"}, true) + .Cases({"strings.h", "stropts.h", "sys/ipc.h", "sys/mman.h", "sys/msg.h"}, + true) + .Cases({"sys/resource.h", "sys/select.h", "sys/sem.h", "sys/shm.h", + "sys/socket.h"}, + true) + .Cases({"sys/stat.h", "sys/statvfs.h", "sys/time.h", "sys/times.h", + "sys/types.h"}, + true) + .Cases( + {"sys/uio.h", "sys/un.h", "sys/utsname.h", "sys/wait.h", "syslog.h"}, + true) + .Cases({"tar.h", "termios.h", "trace.h", "ulimit.h"}, true) + .Cases({"unistd.h", "utime.h", "utmpx.h", "wordexp.h"}, true) + .Default(false); } /// Find a similar string in `Candidates`. 
@@ -3648,14 +3665,14 @@ Preprocessor::LexEmbedParameters(Token &CurTok, bool ForHasEmbed) { std::pair Matches) { Diag(CurTok, diag::err_expected) << Expected; Diag(Matches.second, diag::note_matching) << Matches.first; - if (CurTok.isNot(tok::eod)) + if (CurTok.isNot(EndTokenKind)) DiscardUntilEndOfDirective(CurTok); }; auto ExpectOrDiagAndSkipToEOD = [&](tok::TokenKind Kind) { if (CurTok.isNot(Kind)) { Diag(CurTok, diag::err_expected) << Kind; - if (CurTok.isNot(tok::eod)) + if (CurTok.isNot(EndTokenKind)) DiscardUntilEndOfDirective(CurTok); return false; } @@ -3746,7 +3763,7 @@ Preprocessor::LexEmbedParameters(Token &CurTok, bool ForHasEmbed) { if (Result.isNegative()) { Diag(CurTok, diag::err_requires_positive_value) << toString(Result, 10) << /*positive*/ 0; - if (CurTok.isNot(tok::eod)) + if (CurTok.isNot(EndTokenKind)) DiscardUntilEndOfDirective(CurTok); return std::nullopt; } @@ -3889,7 +3906,7 @@ Preprocessor::LexEmbedParameters(Token &CurTok, bool ForHasEmbed) { } if (!ForHasEmbed) { Diag(ParamStartLoc, diag::err_pp_unknown_parameter) << 1 << Parameter; - if (CurTok.isNot(tok::eod)) + if (CurTok.isNot(EndTokenKind)) DiscardUntilEndOfDirective(CurTok); return std::nullopt; } diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp index d8f61c02a9837..b014124153c83 100644 --- a/clang/lib/Lex/PPLexerChange.cpp +++ b/clang/lib/Lex/PPLexerChange.cpp @@ -302,7 +302,7 @@ void Preprocessor::diagnoseMissingHeaderInUmbrellaDir(const Module &Mod) { // Check whether this entry has an extension typically associated with // headers. if (!StringSwitch(llvm::sys::path::extension(Entry->path())) - .Cases(".h", ".H", ".hh", ".hpp", true) + .Cases({".h", ".H", ".hh", ".hpp"}, true) .Default(false)) continue; diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp index bfe9a1a784b0d..d8281144366ee 100644 --- a/clang/lib/Lex/PPMacroExpansion.cpp +++ b/clang/lib/Lex/PPMacroExpansion.cpp @@ -1262,16 +1262,11 @@ EmbedResult Preprocessor::EvaluateHasEmbed(Token &Tok, IdentifierInfo *II) { std::optional Params = this->LexEmbedParameters(Tok, /*ForHasEmbed=*/true); - assert((Params || Tok.is(tok::eod)) && - "expected success or to be at the end of the directive"); if (!Params) return EmbedResult::Invalid; - if (Params->UnrecognizedParams > 0) - return EmbedResult::NotFound; - - if (!Tok.is(tok::r_paren)) { + if (Tok.isNot(tok::r_paren)) { Diag(this->getLocForEndOfToken(FilenameLoc), diag::err_pp_expected_after) << II << tok::r_paren; Diag(LParenLoc, diag::note_matching) << tok::l_paren; @@ -1280,6 +1275,9 @@ EmbedResult Preprocessor::EvaluateHasEmbed(Token &Tok, IdentifierInfo *II) { return EmbedResult::Invalid; } + if (Params->UnrecognizedParams > 0) + return EmbedResult::NotFound; + SmallString<128> FilenameBuffer; StringRef Filename = this->getSpelling(FilenameTok, FilenameBuffer); if (Filename.empty()) diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp index 98933811265e8..7c2b9280f0b76 100644 --- a/clang/lib/Parse/ParsePragma.cpp +++ b/clang/lib/Parse/ParsePragma.cpp @@ -1419,10 +1419,11 @@ bool Parser::HandlePragmaLoopHint(LoopHint &Hint) { // Return a valid hint if pragma unroll or nounroll were specified // without an argument. 
- auto IsLoopHint = llvm::StringSwitch(PragmaNameInfo->getName()) - .Cases("unroll", "nounroll", "unroll_and_jam", - "nounroll_and_jam", true) - .Default(false); + auto IsLoopHint = + llvm::StringSwitch(PragmaNameInfo->getName()) + .Cases({"unroll", "nounroll", "unroll_and_jam", "nounroll_and_jam"}, + true) + .Default(false); if (Toks.empty() && IsLoopHint) { ConsumeAnnotationToken(); diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp index dbc7cbc6cdc0c..330a9c6aea0c5 100644 --- a/clang/lib/Parse/ParseTemplate.cpp +++ b/clang/lib/Parse/ParseTemplate.cpp @@ -533,6 +533,12 @@ bool Parser::isTypeConstraintAnnotation() { bool Parser::TryAnnotateTypeConstraint() { if (!getLangOpts().CPlusPlus20) return false; + // The type constraint may declare template parameters, notably + // if it contains a generic lambda, so we need to increment + // the template depth as these parameters would not be instantiated + // at the current depth. + TemplateParameterDepthRAII CurTemplateDepthTracker(TemplateParameterDepth); + ++CurTemplateDepthTracker; CXXScopeSpec SS; bool WasScopeAnnotation = Tok.is(tok::annot_cxxscope); if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index bbff627d46600..ec01faf446e8d 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -1272,7 +1272,7 @@ Decl *Parser::ParseFunctionDefinition(ParsingDeclarator &D, // tokens and store them for late parsing at the end of the translation unit. if (getLangOpts().DelayedTemplateParsing && Tok.isNot(tok::equal) && TemplateInfo.Kind == ParsedTemplateKind::Template && - Actions.canDelayFunctionBody(D)) { + LateParsedAttrs->empty() && Actions.canDelayFunctionBody(D)) { MultiTemplateParamsArg TemplateParameterLists(*TemplateInfo.TemplateParams); ParseScope BodyScope(this, Scope::FnScope | Scope::DeclScope | @@ -1301,10 +1301,8 @@ Decl *Parser::ParseFunctionDefinition(ParsingDeclarator &D, } return DP; } - else if (CurParsedObjCImpl && - !TemplateInfo.TemplateParams && - (Tok.is(tok::l_brace) || Tok.is(tok::kw_try) || - Tok.is(tok::colon)) && + if (CurParsedObjCImpl && !TemplateInfo.TemplateParams && + (Tok.is(tok::l_brace) || Tok.is(tok::kw_try) || Tok.is(tok::colon)) && Actions.CurContext->isTranslationUnit()) { ParseScope BodyScope(this, Scope::FnScope | Scope::DeclScope | Scope::CompoundStmtScope); @@ -1420,7 +1418,8 @@ Decl *Parser::ParseFunctionDefinition(ParsingDeclarator &D, // Late attributes are parsed in the same scope as the function body. 
if (LateParsedAttrs) - ParseLexedAttributeList(*LateParsedAttrs, Res, false, true); + ParseLexedAttributeList(*LateParsedAttrs, Res, /*EnterScope=*/false, + /*OnDefinition=*/true); if (SkipFunctionBodies && (!Res || Actions.canSkipFunctionBody(Res)) && trySkippingFunctionBody()) { diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index db14349430933..e797400397d1b 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -361,11 +361,11 @@ static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) { if (!Callee->getIdentifier()) return false; return llvm::StringSwitch(Callee->getName()) - .Cases("begin", "rbegin", "cbegin", "crbegin", true) - .Cases("end", "rend", "cend", "crend", true) - .Cases("c_str", "data", "get", true) + .Cases({"begin", "rbegin", "cbegin", "crbegin"}, true) + .Cases({"end", "rend", "cend", "crend"}, true) + .Cases({"c_str", "data", "get"}, true) // Map and set types. - .Cases("find", "equal_range", "lower_bound", "upper_bound", true) + .Cases({"find", "equal_range", "lower_bound", "upper_bound"}, true) .Default(false); } if (Callee->getReturnType()->isReferenceType()) { @@ -377,7 +377,7 @@ static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) { OO == OverloadedOperatorKind::OO_Star; } return llvm::StringSwitch(Callee->getName()) - .Cases("front", "back", "at", "top", "value", true) + .Cases({"front", "back", "at", "top", "value"}, true) .Default(false); } return false; @@ -394,14 +394,14 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) { if (FD->getReturnType()->isPointerType() || isRecordWithAttr(FD->getReturnType())) { return llvm::StringSwitch(FD->getName()) - .Cases("begin", "rbegin", "cbegin", "crbegin", true) - .Cases("end", "rend", "cend", "crend", true) + .Cases({"begin", "rbegin", "cbegin", "crbegin"}, true) + .Cases({"end", "rend", "cend", "crend"}, true) .Case("data", true) .Default(false); } if (FD->getReturnType()->isReferenceType()) { return llvm::StringSwitch(FD->getName()) - .Cases("get", "any_cast", true) + .Cases({"get", "any_cast"}, true) .Default(false); } return false; diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp index 184d31ecd1e40..9da3d0d2ef599 100644 --- a/clang/lib/Sema/DeclSpec.cpp +++ b/clang/lib/Sema/DeclSpec.cpp @@ -1369,7 +1369,8 @@ void DeclSpec::Finish(Sema &S, const PrintingPolicy &Policy) { if (S.getLangOpts().C23 && getConstexprSpecifier() == ConstexprSpecKind::Constexpr && - StorageClassSpec == SCS_extern) { + getTypeSpecType() != TST_unspecified && + (StorageClassSpec == SCS_extern || StorageClassSpec == SCS_auto)) { S.Diag(ConstexprLoc, diag::err_invalid_decl_spec_combination) << DeclSpec::getSpecifierName(getStorageClassSpec()) << SourceRange(getStorageClassSpecLoc()); diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp index 40c318ae55427..066acf6f01a90 100644 --- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp +++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp @@ -57,6 +57,29 @@ CXXConstructorDecl *lookupCopyConstructor(QualType ResTy) { return CD; return nullptr; } + +ParameterABI +convertParamModifierToParamABI(HLSLParamModifierAttr::Spelling Modifier) { + assert(Modifier != HLSLParamModifierAttr::Spelling::Keyword_in && + "HLSL 'in' parameters modifier cannot be converted to ParameterABI"); + switch (Modifier) { + case HLSLParamModifierAttr::Spelling::Keyword_out: + return ParameterABI::HLSLOut; + case 
HLSLParamModifierAttr::Spelling::Keyword_inout: + return ParameterABI::HLSLInOut; + default: + llvm_unreachable("Invalid HLSL parameter modifier"); + } +} + +QualType getInoutParameterType(ASTContext &AST, QualType Ty) { + assert(!Ty->isReferenceType() && + "Pointer and reference types cannot be inout or out parameters"); + Ty = AST.getLValueReferenceType(Ty); + Ty.addRestrict(); + return Ty; +} + } // namespace // Builder for template arguments of builtin types. Used internally @@ -430,19 +453,36 @@ BuiltinTypeMethodBuilder::addParam(StringRef Name, QualType Ty, void BuiltinTypeMethodBuilder::createDecl() { assert(Method == nullptr && "Method or constructor is already created"); - // create method or constructor type + // create function prototype ASTContext &AST = DeclBuilder.SemaRef.getASTContext(); SmallVector ParamTypes; - for (Param &MP : Params) + SmallVector ParamExtInfos(Params.size()); + uint32_t ArgIndex = 0; + + // Create function prototype. + bool UseParamExtInfo = false; + for (Param &MP : Params) { + if (MP.Modifier != HLSLParamModifierAttr::Keyword_in) { + UseParamExtInfo = true; + FunctionType::ExtParameterInfo &PI = ParamExtInfos[ArgIndex]; + ParamExtInfos[ArgIndex] = + PI.withABI(convertParamModifierToParamABI(MP.Modifier)); + if (!MP.Ty->isDependentType()) + MP.Ty = getInoutParameterType(AST, MP.Ty); + } ParamTypes.emplace_back(MP.Ty); + ++ArgIndex; + } FunctionProtoType::ExtProtoInfo ExtInfo; + if (UseParamExtInfo) + ExtInfo.ExtParameterInfos = ParamExtInfos.data(); if (IsConst) ExtInfo.TypeQuals.addConst(); QualType FuncTy = AST.getFunctionType(ReturnTy, ParamTypes, ExtInfo); - // create method or constructor decl + // Create method or constructor declaration. auto *TSInfo = AST.getTrivialTypeSourceInfo(FuncTy, SourceLocation()); DeclarationNameInfo NameInfo = DeclarationNameInfo(Name, SourceLocation()); if (IsCtor) @@ -455,7 +495,7 @@ void BuiltinTypeMethodBuilder::createDecl() { AST, DeclBuilder.Record, SourceLocation(), NameInfo, FuncTy, TSInfo, SC, false, false, ConstexprSpecKind::Unspecified, SourceLocation()); - // create params & set them to the function prototype + // Create params & set them to the method/constructor and function prototype. SmallVector ParmDecls; unsigned CurScopeDepth = DeclBuilder.SemaRef.getCurScope()->getDepth(); auto FnProtoLoc = @@ -1258,5 +1298,37 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addConsumeMethod() { .finalize(); } +BuiltinTypeDeclBuilder & +BuiltinTypeDeclBuilder::addGetDimensionsMethodForBuffer() { + using PH = BuiltinTypeMethodBuilder::PlaceHolder; + ASTContext &AST = SemaRef.getASTContext(); + QualType UIntTy = AST.UnsignedIntTy; + + QualType HandleTy = getResourceHandleField()->getType(); + auto *AttrResTy = cast(HandleTy.getTypePtr()); + + // Structured buffers except {RW}ByteAddressBuffer have overload + // GetDimensions(out uint numStructs, out uint stride). + if (AttrResTy->getAttrs().RawBuffer && + AttrResTy->getContainedType() != AST.Char8Ty) { + return BuiltinTypeMethodBuilder(*this, "GetDimensions", AST.VoidTy) + .addParam("numStructs", UIntTy, HLSLParamModifierAttr::Keyword_out) + .addParam("stride", UIntTy, HLSLParamModifierAttr::Keyword_out) + .callBuiltin("__builtin_hlsl_resource_getdimensions_x", QualType(), + PH::Handle, PH::_0) + .callBuiltin("__builtin_hlsl_resource_getstride", QualType(), + PH::Handle, PH::_1) + .finalize(); + } + + // Typed buffers and {RW}ByteAddressBuffer have overload + // GetDimensions(out uint dim). 
+ return BuiltinTypeMethodBuilder(*this, "GetDimensions", AST.VoidTy) + .addParam("dim", UIntTy, HLSLParamModifierAttr::Keyword_out) + .callBuiltin("__builtin_hlsl_resource_getdimensions_x", QualType(), + PH::Handle, PH::_0) + .finalize(); +} + } // namespace hlsl } // namespace clang diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h index 86cbd10e4cd6c..95e3a6c4fb2f1 100644 --- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h +++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h @@ -94,6 +94,8 @@ class BuiltinTypeDeclBuilder { BuiltinTypeDeclBuilder &addAppendMethod(); BuiltinTypeDeclBuilder &addConsumeMethod(); + BuiltinTypeDeclBuilder &addGetDimensionsMethodForBuffer(); + private: BuiltinTypeDeclBuilder &addCreateFromBinding(); BuiltinTypeDeclBuilder &addCreateFromImplicitBinding(); diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index e118dda4780e2..6be84f19a8f08 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -159,7 +159,8 @@ void HLSLExternalSemaSource::defineHLSLMatrixAlias() { SourceLocation(), ColsParam)); TemplateParams.emplace_back(ColsParam); - const unsigned MaxMatDim = 4; + const unsigned MaxMatDim = SemaPtr->getLangOpts().MaxMatrixDimension; + auto *MaxRow = IntegerLiteral::Create( AST, llvm::APInt(AST.getIntWidth(AST.IntTy), MaxMatDim), AST.IntTy, SourceLocation()); @@ -379,6 +380,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { /*RawBuffer=*/false, /*HasCounter=*/false) .addArraySubscriptOperators() .addLoadMethods() + .addGetDimensionsMethodForBuffer() .completeDefinition(); }); @@ -391,6 +393,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { /*RawBuffer=*/false, /*HasCounter=*/false) .addArraySubscriptOperators() .addLoadMethods() + .addGetDimensionsMethodForBuffer() .completeDefinition(); }); @@ -403,6 +406,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { /*RawBuffer=*/false, /*HasCounter=*/false) .addArraySubscriptOperators() .addLoadMethods() + .addGetDimensionsMethodForBuffer() .completeDefinition(); }); @@ -414,6 +418,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { /*RawBuffer=*/true, /*HasCounter=*/false) .addArraySubscriptOperators() .addLoadMethods() + .addGetDimensionsMethodForBuffer() .completeDefinition(); }); @@ -427,6 +432,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .addLoadMethods() .addIncrementCounterMethod() .addDecrementCounterMethod() + .addGetDimensionsMethodForBuffer() .completeDefinition(); }); @@ -438,6 +444,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, /*IsROV=*/false, /*RawBuffer=*/true, /*HasCounter=*/true) .addAppendMethod() + .addGetDimensionsMethodForBuffer() .completeDefinition(); }); @@ -449,6 +456,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, /*IsROV=*/false, /*RawBuffer=*/true, /*HasCounter=*/true) .addConsumeMethod() + .addGetDimensionsMethodForBuffer() .completeDefinition(); }); @@ -463,6 +471,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .addLoadMethods() .addIncrementCounterMethod() .addDecrementCounterMethod() + .addGetDimensionsMethodForBuffer() .completeDefinition(); }); @@ -471,6 +480,7 @@ void 
HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::SRV, /*IsROV=*/false, /*RawBuffer=*/true, /*HasCounter=*/false) + .addGetDimensionsMethodForBuffer() .completeDefinition(); }); Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "RWByteAddressBuffer") @@ -478,6 +488,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, /*IsROV=*/false, /*RawBuffer=*/true, /*HasCounter=*/false) + .addGetDimensionsMethodForBuffer() .completeDefinition(); }); Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, @@ -486,6 +497,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { onCompletion(Decl, [this](CXXRecordDecl *Decl) { setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, /*IsROV=*/true, /*RawBuffer=*/true, /*HasCounter=*/false) + .addGetDimensionsMethodForBuffer() .completeDefinition(); }); } diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index d7c66d8be2fd6..00ad27fbbedf6 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -2219,9 +2219,9 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) { else PD << "expression"; - if (Diag(Loc, PD, FD) - << false /*show bit size*/ << 0 << Ty << false /*return*/ - << TI.getTriple().str()) { + if (Diag(Loc, PD) << false /*show bit size*/ << 0 << Ty + << false /*return*/ + << TI.getTriple().str()) { if (D) D->setInvalidDecl(); } @@ -2238,9 +2238,8 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) { else PD << "expression"; - if (Diag(Loc, PD, FD) - << false /*show bit size*/ << 0 << Ty << true /*return*/ - << TI.getTriple().str()) { + if (Diag(Loc, PD) << false /*show bit size*/ << 0 << Ty << true /*return*/ + << TI.getTriple().str()) { if (D) D->setInvalidDecl(); } diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 134a5d8e5a363..0a0ffb69e662c 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -216,6 +216,9 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, (SemaRef.BuiltinConstantArg(TheCall, ArgCount, Result)) || (SemaRef.BuiltinConstantArg(TheCall, (ArgCount - 1), Result)); } + case AMDGPU::BI__builtin_amdgcn_global_load_b128: + case AMDGPU::BI__builtin_amdgcn_global_store_b128: + return checkScopedMemAccessFunctionCall(TheCall); default: return false; } @@ -305,6 +308,19 @@ bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) { return Fail; } +bool SemaAMDGPU::checkScopedMemAccessFunctionCall(CallExpr *TheCall) { + bool Fail = false; + // Last argument is a string literal + Expr *Arg = TheCall->getArg(TheCall->getNumArgs() - 1); + auto Scope = dyn_cast(Arg->IgnoreParenCasts()); + if (!Scope) { + Fail = true; + Diag(TheCall->getBeginLoc(), diag::err_expr_not_string_literal) + << Arg->getSourceRange(); + } + return Fail; +} + bool SemaAMDGPU::checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs, unsigned NumDataArgs) { assert(NumDataArgs <= 2); diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp index f8d61d9f8f5ea..b09e1684e4e64 100644 --- a/clang/lib/Sema/SemaAvailability.cpp +++ b/clang/lib/Sema/SemaAvailability.cpp @@ -102,7 +102,7 @@ Sema::ShouldDiagnoseAvailabilityOfDecl(const NamedDecl *D, std::string *Message, break; for (const Type *T = 
TD->getUnderlyingType().getTypePtr(); /**/; /**/) {
       if (auto *TT = dyn_cast<TagType>(T)) {
-        D = TT->getOriginalDecl()->getDefinitionOrSelf();
+        D = TT->getDecl()->getDefinitionOrSelf();
       } else if (isa(T)) {
         // A Subst* node represents a use through a template.
         // Any uses of the underlying declaration happened through its template
@@ -1019,7 +1019,7 @@ bool DiagnoseUnguardedAvailability::VisitTypeLoc(TypeLoc Ty) {
     return true;
 
   if (const auto *TT = dyn_cast<TagType>(TyPtr)) {
-    TagDecl *TD = TT->getOriginalDecl()->getDefinitionOrSelf();
+    TagDecl *TD = TT->getDecl()->getDefinitionOrSelf();
     DiagnoseDeclAvailability(TD, Range);
 
   } else if (const auto *TD = dyn_cast<TypedefType>(TyPtr)) {
diff --git a/clang/lib/Sema/SemaBPF.cpp b/clang/lib/Sema/SemaBPF.cpp
index be890ab7fa75f..a9761764bbdc6 100644
--- a/clang/lib/Sema/SemaBPF.cpp
+++ b/clang/lib/Sema/SemaBPF.cpp
@@ -57,7 +57,7 @@ static bool isValidPreserveTypeInfoArg(Expr *Arg) {
 
   // Record type or Enum type.
   if (const auto *RT = ArgType->getAsCanonical<TagType>())
-    if (!RT->getOriginalDecl()->getDeclName().isEmpty())
+    if (!RT->getDecl()->getDeclName().isEmpty())
       return true;
 
   return false;
diff --git a/clang/lib/Sema/SemaBase.cpp b/clang/lib/Sema/SemaBase.cpp
index 9b677f446f3e6..bf32491be31ba 100644
--- a/clang/lib/Sema/SemaBase.cpp
+++ b/clang/lib/Sema/SemaBase.cpp
@@ -58,13 +58,13 @@ SemaBase::SemaDiagnosticBuilder::getDeviceDeferredDiags() const {
   return S.DeviceDeferredDiags;
 }
 
-Sema::SemaDiagnosticBuilder SemaBase::Diag(SourceLocation Loc, unsigned DiagID,
-                                           bool DeferHint) {
+Sema::SemaDiagnosticBuilder SemaBase::Diag(SourceLocation Loc,
+                                           unsigned DiagID) {
   bool IsError =
       getDiagnostics().getDiagnosticIDs()->isDefaultMappingAsError(DiagID);
   bool ShouldDefer = getLangOpts().CUDA && getLangOpts().GPUDeferDiag &&
                      DiagnosticIDs::isDeferrable(DiagID) &&
-                     (DeferHint || SemaRef.DeferDiags || !IsError);
+                     (SemaRef.DeferDiags || !IsError);
   auto SetIsLastErrorImmediate = [&](bool Flag) {
     if (IsError)
       SemaRef.IsLastErrorImmediate = Flag;
@@ -83,16 +83,13 @@ Sema::SemaDiagnosticBuilder SemaBase::Diag(SourceLocation Loc, unsigned DiagID,
 }
 
 Sema::SemaDiagnosticBuilder SemaBase::Diag(SourceLocation Loc,
-                                           const PartialDiagnostic &PD,
-                                           bool DeferHint) {
-  return Diag(Loc, PD.getDiagID(), DeferHint) << PD;
+                                           const PartialDiagnostic &PD) {
+  return Diag(Loc, PD.getDiagID()) << PD;
 }
 
 SemaBase::SemaDiagnosticBuilder SemaBase::DiagCompat(SourceLocation Loc,
-                                                     unsigned CompatDiagId,
-                                                     bool DeferHint) {
+                                                     unsigned CompatDiagId) {
   return Diag(Loc,
-              DiagnosticIDs::getCXXCompatDiagId(getLangOpts(), CompatDiagId),
-              DeferHint);
+              DiagnosticIDs::getCXXCompatDiagId(getLangOpts(), CompatDiagId));
 }
 
 } // namespace clang
diff --git a/clang/lib/Sema/SemaCXXScopeSpec.cpp b/clang/lib/Sema/SemaCXXScopeSpec.cpp
index 97ba1a510cee4..c52fc5bf815af 100644
--- a/clang/lib/Sema/SemaCXXScopeSpec.cpp
+++ b/clang/lib/Sema/SemaCXXScopeSpec.cpp
@@ -31,8 +31,7 @@ static CXXRecordDecl *getCurrentInstantiationOf(QualType T,
   const TagType *TagTy = dyn_cast<TagType>(T->getCanonicalTypeInternal());
   if (!isa_and_present(TagTy))
     return nullptr;
-  auto *RD =
-      cast<CXXRecordDecl>(TagTy->getOriginalDecl())->getDefinitionOrSelf();
+  auto *RD = cast<CXXRecordDecl>(TagTy->getDecl())->getDefinitionOrSelf();
   if (isa<InjectedClassNameType>(TagTy) || RD->isCurrentInstantiation(CurContext))
     return RD;
@@ -121,7 +120,7 @@ DeclContext *Sema::computeDeclContext(const CXXScopeSpec &SS,
     }
   } else if (const auto *RecordT = dyn_cast<RecordType>(NNSType)) {
     // The nested name specifier refers to a member of a class template.
-    return RecordT->getOriginalDecl()->getDefinitionOrSelf();
+    return RecordT->getDecl()->getDefinitionOrSelf();
   }
 }
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index b8bafcfdc41ea..46fc19e8240e3 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -963,7 +963,7 @@ void CastOperation::CheckDynamicCast() {
   }
 
   // C++ 5.2.7p6: Otherwise, v shall be [polymorphic].
-  const RecordDecl *SrcDecl = SrcRecord->getOriginalDecl()->getDefinition();
+  const RecordDecl *SrcDecl = SrcRecord->getDecl()->getDefinition();
   assert(SrcDecl && "Definition missing");
   if (!cast<CXXRecordDecl>(SrcDecl)->isPolymorphic()) {
     Self.Diag(OpRange.getBegin(), diag::err_bad_dynamic_cast_not_polymorphic)
@@ -1454,7 +1454,7 @@ static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr,
   // converted to an integral type. [...] A value of a scoped enumeration type
   // can also be explicitly converted to a floating-point type [...].
   if (const EnumType *Enum = dyn_cast<EnumType>(SrcType)) {
-    if (Enum->getOriginalDecl()->isScoped()) {
+    if (Enum->getDecl()->isScoped()) {
       if (DestType->isBooleanType()) {
         Kind = CK_IntegralToBoolean;
         return TC_Success;
@@ -2944,6 +2944,8 @@ bool CastOperation::CheckHLSLCStyleCast(CheckedConversionKind CCK) {
       SrcExpr = Self.ImpCastExprToType(
           SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy),
           CK_HLSLArrayRValue, VK_PRValue, nullptr, CCK);
+    else
+      SrcExpr = Self.DefaultLvalueConversion(SrcExpr.get());
     Kind = CK_HLSLElementwiseCast;
     return true;
   }
@@ -2952,6 +2954,7 @@ bool CastOperation::CheckHLSLCStyleCast(CheckedConversionKind CCK) {
   // If the relative order of this and the HLSLElementWise cast checks
   // are changed, it might change which cast handles what in a few cases
   if (Self.HLSL().CanPerformAggregateSplatCast(SrcExpr.get(), DestType)) {
+    SrcExpr = Self.DefaultLvalueConversion(SrcExpr.get());
     const VectorType *VT = SrcTy->getAs<VectorType>();
     // change splat from vec1 case to splat from scalar
     if (VT && VT->getNumElements() == 1)
@@ -3122,7 +3125,7 @@ void CastOperation::CheckCStyleCast() {
   }
 
   // GCC's cast to union extension.
-  if (RecordDecl *RD = DestRecordTy->getOriginalDecl(); RD->isUnion()) {
+  if (RecordDecl *RD = DestRecordTy->getDecl(); RD->isUnion()) {
     if (CastExpr::getTargetFieldForToUnionCast(RD->getDefinitionOrSelf(),
                                                SrcType)) {
       Self.Diag(OpRange.getBegin(), diag::ext_typecheck_cast_to_union)
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 5e4c8491b3e6b..e007d9caac244 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3580,9 +3580,8 @@ static bool CheckNonNullExpr(Sema &S, const Expr *Expr) {
   // As a special case, transparent unions initialized with zero are
   // considered null for the purposes of the nonnull attribute.
   if (const RecordType *UT = Expr->getType()->getAsUnionType();
-      UT && UT->getOriginalDecl()
-                ->getMostRecentDecl()
-                ->hasAttr<TransparentUnionAttr>()) {
+      UT &&
+      UT->getDecl()->getMostRecentDecl()->hasAttr<TransparentUnionAttr>()) {
     if (const auto *CLE = dyn_cast<CompoundLiteralExpr>(Expr))
       if (const auto *ILE = dyn_cast<InitListExpr>(CLE->getInitializer()))
         Expr = ILE->getInit(0);
@@ -6927,13 +6926,13 @@ StringRef Sema::GetFormatStringTypeName(FormatStringType FST) {
 
 FormatStringType Sema::GetFormatStringType(StringRef Flavor) {
   return llvm::StringSwitch<FormatStringType>(Flavor)
-      .Cases("gnu_scanf", "scanf", FormatStringType::Scanf)
-      .Cases("gnu_printf", "printf", "printf0", "syslog",
+      .Cases({"gnu_scanf", "scanf"}, FormatStringType::Scanf)
+      .Cases({"gnu_printf", "printf", "printf0", "syslog"},
              FormatStringType::Printf)
-      .Cases("NSString", "CFString", FormatStringType::NSString)
-      .Cases("gnu_strftime", "strftime", FormatStringType::Strftime)
-      .Cases("gnu_strfmon", "strfmon", FormatStringType::Strfmon)
-      .Cases("kprintf", "cmn_err", "vcmn_err", "zcmn_err",
+      .Cases({"NSString", "CFString"}, FormatStringType::NSString)
+      .Cases({"gnu_strftime", "strftime"}, FormatStringType::Strftime)
+      .Cases({"gnu_strfmon", "strfmon"}, FormatStringType::Strfmon)
+      .Cases({"kprintf", "cmn_err", "vcmn_err", "zcmn_err"},
              FormatStringType::Kprintf)
       .Case("freebsd_kprintf", FormatStringType::FreeBSDKPrintf)
       .Case("os_trace", FormatStringType::OSLog)
@@ -12310,13 +12309,20 @@ static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source,
                                                    SourceLocation CC) {
   assert(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType() &&
          Source != Target);
+
+  // Lone surrogates have a distinct representation in UTF-32.
+  // Converting between UTF-16 and UTF-32 codepoints seems very widespread,
+  // so don't warn on such conversion.
+  if (Source->isChar16Type() && Target->isChar32Type())
+    return;
+
   Expr::EvalResult Result;
   if (E->EvaluateAsInt(Result, S.getASTContext(), Expr::SE_AllowSideEffects,
                        S.isConstantEvaluatedContext())) {
     llvm::APSInt Value(32);
     Value = Result.Val.getInt();
     bool IsASCII = Value <= 0x7F;
-    bool IsBMP = Value <= 0xD7FF || (Value >= 0xE000 && Value <= 0xFFFF);
+    bool IsBMP = Value <= 0xDFFF || (Value >= 0xE000 && Value <= 0xFFFF);
     bool ConversionPreservesSemantics =
         IsASCII || (!Source->isChar8Type() && !Target->isChar8Type() && IsBMP);
@@ -12881,8 +12887,8 @@ void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
   if (const EnumType *SourceEnum = Source->getAsCanonical<EnumType>())
     if (const EnumType *TargetEnum = Target->getAsCanonical<EnumType>())
-      if (SourceEnum->getOriginalDecl()->hasNameForLinkage() &&
-          TargetEnum->getOriginalDecl()->hasNameForLinkage() &&
+      if (SourceEnum->getDecl()->hasNameForLinkage() &&
+          TargetEnum->getDecl()->hasNameForLinkage() &&
           SourceEnum != TargetEnum) {
         if (SourceMgr.isInSystemMacro(CC))
           return;
@@ -16242,9 +16248,9 @@ getAndVerifyMatrixDimension(Expr *Expr, StringRef Name, Sema &S) {
     return {};
   }
   uint64_t Dim = Value->getZExtValue();
-  if (!ConstantMatrixType::isDimensionValid(Dim)) {
+  if (Dim == 0 || Dim > S.Context.getLangOpts().MaxMatrixDimension) {
     S.Diag(Expr->getBeginLoc(), diag::err_builtin_matrix_invalid_dimension)
-        << Name << ConstantMatrixType::getMaxElementsPerDimension();
+        << Name << S.Context.getLangOpts().MaxMatrixDimension;
     return {};
   }
   return Dim;
diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp
index 5dd49497ce9fd..0514d1033f74f 100644
--- a/clang/lib/Sema/SemaCodeComplete.cpp
+++ b/clang/lib/Sema/SemaCodeComplete.cpp
@@ -2070,7 +2070,7 @@ static const char
*GetCompletionTypeString(QualType T, ASTContext &Context, // Anonymous tag types are constant strings. if (const TagType *TagT = dyn_cast(T)) - if (TagDecl *Tag = TagT->getOriginalDecl()) + if (TagDecl *Tag = TagT->getDecl()) if (!Tag->hasNameForLinkage()) { switch (Tag->getTagKind()) { case TagTypeKind::Struct: diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 7c44efd7b9c1c..54cbfe46a6528 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -432,7 +432,7 @@ class ConstraintSatisfactionChecker { // XXX: It is SLOW! Use it very carefully. std::optional SubstitutionInTemplateArguments( const NormalizedConstraintWithParamMapping &Constraint, - MultiLevelTemplateArgumentList MLTAL, + const MultiLevelTemplateArgumentList &MLTAL, llvm::SmallVector &SubstitutedOuterMost); ExprResult EvaluateSlow(const AtomicConstraint &Constraint, @@ -564,12 +564,17 @@ ExprResult ConstraintSatisfactionChecker::EvaluateAtomicConstraint( std::optional ConstraintSatisfactionChecker::SubstitutionInTemplateArguments( const NormalizedConstraintWithParamMapping &Constraint, - MultiLevelTemplateArgumentList MLTAL, - llvm::SmallVector &SubstitutedOuterMost) { + const MultiLevelTemplateArgumentList &MLTAL, + llvm::SmallVector &SubstitutedOutermost) { if (!Constraint.hasParameterMapping()) return std::move(MLTAL); + // The mapping is empty, meaning no template arguments are needed for + // evaluation. + if (Constraint.getParameterMapping().empty()) + return MultiLevelTemplateArgumentList(); + TemplateDeductionInfo Info(Constraint.getBeginLoc()); Sema::InstantiatingTemplate Inst( S, Constraint.getBeginLoc(), @@ -606,37 +611,39 @@ ConstraintSatisfactionChecker::SubstitutionInTemplateArguments( Constraint.mappingOccurenceList(); // The empty MLTAL situation should only occur when evaluating non-dependent // constraints. 
- if (!MLTAL.getNumSubstitutedLevels()) - MLTAL.addOuterTemplateArguments(TD, {}, /*Final=*/false); - SubstitutedOuterMost = - llvm::to_vector_of(MLTAL.getOutermost()); + if (MLTAL.getNumSubstitutedLevels()) + SubstitutedOutermost = + llvm::to_vector_of(MLTAL.getOutermost()); unsigned Offset = 0; for (unsigned I = 0, MappedIndex = 0; I < Used.size(); I++) { TemplateArgument Arg; if (Used[I]) - Arg = CTAI.SugaredConverted[MappedIndex++]; - if (I < SubstitutedOuterMost.size()) { - SubstitutedOuterMost[I] = Arg; + Arg = S.Context.getCanonicalTemplateArgument( + CTAI.SugaredConverted[MappedIndex++]); + if (I < SubstitutedOutermost.size()) { + SubstitutedOutermost[I] = Arg; Offset = I + 1; } else { - SubstitutedOuterMost.push_back(Arg); - Offset = SubstitutedOuterMost.size(); + SubstitutedOutermost.push_back(Arg); + Offset = SubstitutedOutermost.size(); } } - if (Offset < SubstitutedOuterMost.size()) - SubstitutedOuterMost.erase(SubstitutedOuterMost.begin() + Offset); + if (Offset < SubstitutedOutermost.size()) + SubstitutedOutermost.erase(SubstitutedOutermost.begin() + Offset); - MLTAL.replaceOutermostTemplateArguments(TD, SubstitutedOuterMost); - return std::move(MLTAL); + MultiLevelTemplateArgumentList SubstitutedTemplateArgs; + SubstitutedTemplateArgs.addOuterTemplateArguments(TD, SubstitutedOutermost, + /*Final=*/false); + return std::move(SubstitutedTemplateArgs); } ExprResult ConstraintSatisfactionChecker::EvaluateSlow( const AtomicConstraint &Constraint, const MultiLevelTemplateArgumentList &MLTAL) { - llvm::SmallVector SubstitutedOuterMost; + llvm::SmallVector SubstitutedOutermost; std::optional SubstitutedArgs = - SubstitutionInTemplateArguments(Constraint, MLTAL, SubstitutedOuterMost); + SubstitutionInTemplateArguments(Constraint, MLTAL, SubstitutedOutermost); if (!SubstitutedArgs) { Satisfaction.IsSatisfied = false; return ExprEmpty(); @@ -734,8 +741,9 @@ ExprResult ConstraintSatisfactionChecker::Evaluate( UnsubstitutedConstraintSatisfactionCacheResult Cache; Cache.Satisfaction.ContainsErrors = Satisfaction.ContainsErrors; Cache.Satisfaction.IsSatisfied = Satisfaction.IsSatisfied; - std::copy(Satisfaction.Details.begin() + Size, Satisfaction.Details.end(), - std::back_inserter(Cache.Satisfaction.Details)); + Cache.Satisfaction.Details.insert(Cache.Satisfaction.Details.end(), + Satisfaction.Details.begin() + Size, + Satisfaction.Details.end()); Cache.SubstExpr = E; S.UnsubstitutedConstraintSatisfactionCache.insert({ID, std::move(Cache)}); @@ -784,13 +792,13 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow( FoldExpandedConstraint::FoldOperatorKind::And; unsigned EffectiveDetailEndIndex = Satisfaction.Details.size(); - llvm::SmallVector SubstitutedOuterMost; + llvm::SmallVector SubstitutedOutermost; // FIXME: Is PackSubstitutionIndex correct? 
llvm::SaveAndRestore _(PackSubstitutionIndex, S.ArgPackSubstIndex); std::optional SubstitutedArgs = SubstitutionInTemplateArguments( static_cast(Constraint), - MLTAL, SubstitutedOuterMost); + MLTAL, SubstitutedOutermost); if (!SubstitutedArgs) { Satisfaction.IsSatisfied = false; return ExprError(); @@ -866,8 +874,9 @@ ExprResult ConstraintSatisfactionChecker::Evaluate( UnsubstitutedConstraintSatisfactionCacheResult Cache; Cache.Satisfaction.ContainsErrors = Satisfaction.ContainsErrors; Cache.Satisfaction.IsSatisfied = Satisfaction.IsSatisfied; - std::copy(Satisfaction.Details.begin() + Size, Satisfaction.Details.end(), - std::back_inserter(Cache.Satisfaction.Details)); + Cache.Satisfaction.Details.insert(Cache.Satisfaction.Details.end(), + Satisfaction.Details.begin() + Size, + Satisfaction.Details.end()); Cache.SubstExpr = E; S.UnsubstitutedConstraintSatisfactionCache.insert({ID, std::move(Cache)}); return E; @@ -878,9 +887,9 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow( const MultiLevelTemplateArgumentList &MLTAL, unsigned Size) { const ConceptReference *ConceptId = Constraint.getConceptId(); - llvm::SmallVector SubstitutedOuterMost; + llvm::SmallVector SubstitutedOutermost; std::optional SubstitutedArgs = - SubstitutionInTemplateArguments(Constraint, MLTAL, SubstitutedOuterMost); + SubstitutionInTemplateArguments(Constraint, MLTAL, SubstitutedOutermost); if (!SubstitutedArgs) { Satisfaction.IsSatisfied = false; @@ -1010,8 +1019,9 @@ ExprResult ConstraintSatisfactionChecker::Evaluate( UnsubstitutedConstraintSatisfactionCacheResult Cache; Cache.Satisfaction.ContainsErrors = Satisfaction.ContainsErrors; Cache.Satisfaction.IsSatisfied = Satisfaction.IsSatisfied; - std::copy(Satisfaction.Details.begin() + Size, Satisfaction.Details.end(), - std::back_inserter(Cache.Satisfaction.Details)); + Cache.Satisfaction.Details.insert(Cache.Satisfaction.Details.end(), + Satisfaction.Details.begin() + Size, + Satisfaction.Details.end()); Cache.SubstExpr = CE; S.UnsubstitutedConstraintSatisfactionCache.insert({ID, std::move(Cache)}); return CE; @@ -1215,13 +1225,51 @@ bool Sema::CheckConstraintSatisfaction( return false; } +static ExprResult +SubstituteConceptsInConstraintExpression(Sema &S, const NamedDecl *D, + const ConceptSpecializationExpr *CSE, + UnsignedOrNone SubstIndex) { + + // [C++2c] [temp.constr.normal] + // Otherwise, to form CE, any non-dependent concept template argument Ai + // is substituted into the constraint-expression of C. + // If any such substitution results in an invalid concept-id, + // the program is ill-formed; no diagnostic is required. 
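The quoted wording is what SubstituteConceptsInConstraintExpression implements: when a concept-id passes another concept as a non-dependent template argument, that argument is substituted into the named concept's constraint-expression before normalization. A sketch of the source pattern this affects, assuming Clang's in-progress support for C++2c concept template parameters (P2841); the snippet is illustrative and not part of the patch:

template <class T>
concept Fits = sizeof(T) <= sizeof(long);

// In Wraps<Fits, int>, the argument Fits is a non-dependent concept
// template argument, so Fits's constraint-expression is substituted into
// Wraps's constraints when the concept-id is normalized.
template <template <class> concept C, class T>
concept Wraps = C<T>;

static_assert(Wraps<Fits, int>);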
+ + ConceptDecl *Concept = CSE->getNamedConcept()->getCanonicalDecl(); + Sema::ArgPackSubstIndexRAII _(S, SubstIndex); + + const ASTTemplateArgumentListInfo *ArgsAsWritten = + CSE->getTemplateArgsAsWritten(); + if (llvm::none_of( + ArgsAsWritten->arguments(), [&](const TemplateArgumentLoc &ArgLoc) { + return !ArgLoc.getArgument().isDependent() && + ArgLoc.getArgument().isConceptOrConceptTemplateParameter(); + })) { + return Concept->getConstraintExpr(); + } + + MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( + Concept, Concept->getLexicalDeclContext(), + /*Final=*/false, CSE->getTemplateArguments(), + /*RelativeToPrimary=*/true, + /*Pattern=*/nullptr, + /*ForConstraintInstantiation=*/true); + return S.SubstConceptTemplateArguments(CSE, Concept->getConstraintExpr(), + MLTAL); +} + bool Sema::CheckConstraintSatisfaction( const ConceptSpecializationExpr *ConstraintExpr, ConstraintSatisfaction &Satisfaction) { + ExprResult Res = SubstituteConceptsInConstraintExpression( + *this, nullptr, ConstraintExpr, ArgPackSubstIndex); + if (!Res.isUsable()) + return true; + llvm::SmallVector Constraints; - Constraints.emplace_back( - ConstraintExpr->getNamedConcept()->getConstraintExpr()); + Constraints.emplace_back(Res.get()); MultiLevelTemplateArgumentList MLTAL(ConstraintExpr->getNamedConcept(), ConstraintExpr->getTemplateArguments(), @@ -1977,8 +2025,13 @@ void SubstituteParameterMappings::buildParameterMapping( SemaRef.MarkUsedTemplateParameters(Args->arguments(), /*Depth=*/0, OccurringIndices); } + unsigned Size = OccurringIndices.count(); + // When the constraint is independent of any template parameters, + // we build an empty mapping so that we can distinguish these cases + // from cases where no mapping exists at all, e.g. when there are only atomic + // constraints. TemplateArgumentLoc *TempArgs = - new (SemaRef.Context) TemplateArgumentLoc[OccurringIndices.count()]; + new (SemaRef.Context) TemplateArgumentLoc[Size]; llvm::SmallVector UsedParams; for (unsigned I = 0, J = 0, C = TemplateParams->size(); I != C; ++I) { SourceLocation Loc = ArgsAsWritten->NumTemplateArgs > I @@ -1999,7 +2052,6 @@ void SubstituteParameterMappings::buildParameterMapping( TemplateParams->getLAngleLoc(), UsedParams, /*RAngleLoc=*/SourceLocation(), /*RequiresClause=*/nullptr); - unsigned Size = OccurringIndices.count(); N.updateParameterMapping( std::move(OccurringIndices), std::move(OccurringIndicesForSubsumption), MutableArrayRef{TempArgs, Size}, UsedList); @@ -2010,6 +2062,10 @@ bool SubstituteParameterMappings::substitute( if (!N.hasParameterMapping()) buildParameterMapping(N); + // If the parameter mapping is empty, there is nothing to substitute. + if (N.getParameterMapping().empty()) + return false; + SourceLocation InstLocBegin, InstLocEnd; llvm::ArrayRef Arguments = ArgsAsWritten->arguments(); if (Arguments.empty()) { @@ -2247,8 +2303,14 @@ NormalizedConstraint *NormalizedConstraint::fromConstraintExpr( // Use canonical declarations to merge ConceptDecls across // different modules. 
  ConceptDecl *CD = CSE->getNamedConcept()->getCanonicalDecl();
+
+  ExprResult Res =
+      SubstituteConceptsInConstraintExpression(S, D, CSE, SubstIndex);
+  if (!Res.isUsable())
+    return nullptr;
+
   SubNF = NormalizedConstraint::fromAssociatedConstraints(
-      S, CD, AssociatedConstraint(CD->getConstraintExpr(), SubstIndex));
+      S, CD, AssociatedConstraint(Res.get(), SubstIndex));
 
   if (!SubNF)
     return nullptr;
diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp
index 229e91ed04caa..c0aba832dba94 100644
--- a/clang/lib/Sema/SemaCoroutine.cpp
+++ b/clang/lib/Sema/SemaCoroutine.cpp
@@ -640,10 +640,9 @@ static void checkNoThrow(Sema &S, const Stmt *E,
             QualType::DestructionKind::DK_cxx_destructor) {
           const auto *T =
               cast<RecordType>(ReturnType.getCanonicalType().getTypePtr());
-          checkDeclNoexcept(cast<CXXRecordDecl>(T->getOriginalDecl())
-                                ->getDefinition()
-                                ->getDestructor(),
-                            /*IsDtor=*/true);
+          checkDeclNoexcept(
+              cast<CXXRecordDecl>(T->getDecl())->getDefinition()->getDestructor(),
+              /*IsDtor=*/true);
         }
       } else
         for (const auto *Child : E->children()) {
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 3b1e9c59813e2..5c5f127aa4e33 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -7640,6 +7640,58 @@ static bool isMainVar(DeclarationName Name, VarDecl *VD) {
           VD->isExternC());
 }
 
+void Sema::CheckAsmLabel(Scope *S, Expr *E, StorageClass SC,
+                         TypeSourceInfo *TInfo, VarDecl *NewVD) {
+
+  // Quickly return if the declaration does not have an `asm` attribute.
+  if (E == nullptr)
+    return;
+
+  // The parser guarantees this is a string.
+  StringLiteral *SE = cast<StringLiteral>(E);
+  StringRef Label = SE->getString();
+  QualType R = TInfo->getType();
+  if (S->getFnParent() != nullptr) {
+    switch (SC) {
+    case SC_None:
+    case SC_Auto:
+      Diag(E->getExprLoc(), diag::warn_asm_label_on_auto_decl) << Label;
+      break;
+    case SC_Register:
+      // Local Named register
+      if (!Context.getTargetInfo().isValidGCCRegisterName(Label) &&
+          DeclAttrsMatchCUDAMode(getLangOpts(), getCurFunctionDecl()))
+        Diag(E->getExprLoc(), diag::err_asm_unknown_register_name) << Label;
+      break;
+    case SC_Static:
+    case SC_Extern:
+    case SC_PrivateExtern:
+      break;
+    }
+  } else if (SC == SC_Register) {
+    // Global Named register
+    if (DeclAttrsMatchCUDAMode(getLangOpts(), NewVD)) {
+      const auto &TI = Context.getTargetInfo();
+      bool HasSizeMismatch;
+
+      if (!TI.isValidGCCRegisterName(Label))
+        Diag(E->getExprLoc(), diag::err_asm_unknown_register_name) << Label;
+      else if (!TI.validateGlobalRegisterVariable(Label, Context.getTypeSize(R),
+                                                  HasSizeMismatch))
+        Diag(E->getExprLoc(), diag::err_asm_invalid_global_var_reg) << Label;
+      else if (HasSizeMismatch)
+        Diag(E->getExprLoc(), diag::err_asm_register_size_mismatch) << Label;
+    }
+
+    if (!R->isIntegralType(Context) && !R->isPointerType()) {
+      Diag(TInfo->getTypeLoc().getBeginLoc(),
+           diag::err_asm_unsupported_register_type)
+          << TInfo->getTypeLoc().getSourceRange();
+      NewVD->setInvalidDecl(true);
+    }
+  }
+}
+
 NamedDecl *Sema::ActOnVariableDeclarator(
     Scope *S, Declarator &D, DeclContext *DC, TypeSourceInfo *TInfo,
     LookupResult &Previous, MultiTemplateParamsArg TemplateParamLists,
@@ -8124,6 +8176,26 @@ NamedDecl *Sema::ActOnVariableDeclarator(
     }
   }
 
+  if (Expr *E = D.getAsmLabel()) {
+    // The parser guarantees this is a string.
+    StringLiteral *SE = cast<StringLiteral>(E);
+    StringRef Label = SE->getString();
+
+    // Insert the asm attribute.
+ NewVD->addAttr(AsmLabelAttr::Create(Context, Label, SE->getStrTokenLoc(0))); + } else if (!ExtnameUndeclaredIdentifiers.empty()) { + llvm::DenseMap::iterator I = + ExtnameUndeclaredIdentifiers.find(NewVD->getIdentifier()); + if (I != ExtnameUndeclaredIdentifiers.end()) { + if (isDeclExternC(NewVD)) { + NewVD->addAttr(I->second); + ExtnameUndeclaredIdentifiers.erase(I); + } else + Diag(NewVD->getLocation(), diag::warn_redefine_extname_not_applied) + << /*Variable*/ 1 << NewVD; + } + } + // Handle attributes prior to checking for duplicates in MergeVarDecl ProcessDeclAttributes(S, NewVD, D); @@ -8174,65 +8246,11 @@ NamedDecl *Sema::ActOnVariableDeclarator( if (getLangOpts().ObjCAutoRefCount && ObjC().inferObjCARCLifetime(NewVD)) NewVD->setInvalidDecl(); - // Handle GNU asm-label extension (encoded as an attribute). - if (Expr *E = D.getAsmLabel()) { - // The parser guarantees this is a string. - StringLiteral *SE = cast(E); - StringRef Label = SE->getString(); - if (S->getFnParent() != nullptr) { - switch (SC) { - case SC_None: - case SC_Auto: - Diag(E->getExprLoc(), diag::warn_asm_label_on_auto_decl) << Label; - break; - case SC_Register: - // Local Named register - if (!Context.getTargetInfo().isValidGCCRegisterName(Label) && - DeclAttrsMatchCUDAMode(getLangOpts(), getCurFunctionDecl())) - Diag(E->getExprLoc(), diag::err_asm_unknown_register_name) << Label; - break; - case SC_Static: - case SC_Extern: - case SC_PrivateExtern: - break; - } - } else if (SC == SC_Register) { - // Global Named register - if (DeclAttrsMatchCUDAMode(getLangOpts(), NewVD)) { - const auto &TI = Context.getTargetInfo(); - bool HasSizeMismatch; - - if (!TI.isValidGCCRegisterName(Label)) - Diag(E->getExprLoc(), diag::err_asm_unknown_register_name) << Label; - else if (!TI.validateGlobalRegisterVariable(Label, - Context.getTypeSize(R), - HasSizeMismatch)) - Diag(E->getExprLoc(), diag::err_asm_invalid_global_var_reg) << Label; - else if (HasSizeMismatch) - Diag(E->getExprLoc(), diag::err_asm_register_size_mismatch) << Label; - } - - if (!R->isIntegralType(Context) && !R->isPointerType()) { - Diag(TInfo->getTypeLoc().getBeginLoc(), - diag::err_asm_unsupported_register_type) - << TInfo->getTypeLoc().getSourceRange(); - NewVD->setInvalidDecl(true); - } - } - - NewVD->addAttr(AsmLabelAttr::Create(Context, Label, SE->getStrTokenLoc(0))); - } else if (!ExtnameUndeclaredIdentifiers.empty()) { - llvm::DenseMap::iterator I = - ExtnameUndeclaredIdentifiers.find(NewVD->getIdentifier()); - if (I != ExtnameUndeclaredIdentifiers.end()) { - if (isDeclExternC(NewVD)) { - NewVD->addAttr(I->second); - ExtnameUndeclaredIdentifiers.erase(I); - } else - Diag(NewVD->getLocation(), diag::warn_redefine_extname_not_applied) - << /*Variable*/1 << NewVD; - } - } + // Check the ASM label here, as we need to know all other attributes of the + // Decl first. Otherwise, we can't know if the asm label refers to the + // host or device in a CUDA context. The device has other registers than + // host and we must know where the function will be placed. + CheckAsmLabel(S, D.getAsmLabel(), SC, TInfo, NewVD); // Find the shadowed declaration before filtering for scope. NamedDecl *ShadowedDecl = D.getCXXScopeSpec().isEmpty() @@ -13836,13 +13854,20 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { VDecl->setInvalidDecl(); } - // C++ [module.import/6] external definitions are not permitted in header - // units. + // C++ [module.import/6] + // ... 
+ // A header unit shall not contain a definition of a non-inline function or + // variable whose name has external linkage. + // + // We choose to allow weak & selectany definitions, as they are common in + // headers, and have semantics similar to inline definitions which are allowed + // in header units. if (getLangOpts().CPlusPlusModules && currentModuleIsHeaderUnit() && !VDecl->isInvalidDecl() && VDecl->isThisDeclarationADefinition() && VDecl->getFormalLinkage() == Linkage::External && !VDecl->isInline() && !VDecl->isTemplated() && !isa(VDecl) && - !VDecl->getInstantiatedFromStaticDataMember()) { + !VDecl->getInstantiatedFromStaticDataMember() && + !(VDecl->hasAttr() || VDecl->hasAttr())) { Diag(VDecl->getLocation(), diag::err_extern_def_in_header_unit); VDecl->setInvalidDecl(); } @@ -16180,16 +16205,24 @@ Decl *Sema::ActOnStartOfFunctionDef(Scope *FnBodyScope, Decl *D, } } - // C++ [module.import/6] external definitions are not permitted in header - // units. Deleted and Defaulted functions are implicitly inline (but the + // C++ [module.import/6] + // ... + // A header unit shall not contain a definition of a non-inline function or + // variable whose name has external linkage. + // + // Deleted and Defaulted functions are implicitly inline (but the // inline state is not set at this point, so check the BodyKind explicitly). + // We choose to allow weak & selectany definitions, as they are common in + // headers, and have semantics similar to inline definitions which are allowed + // in header units. // FIXME: Consider an alternate location for the test where the inlined() // state is complete. if (getLangOpts().CPlusPlusModules && currentModuleIsHeaderUnit() && !FD->isInvalidDecl() && !FD->isInlined() && BodyKind != FnBodyKind::Delete && BodyKind != FnBodyKind::Default && FD->getFormalLinkage() == Linkage::External && !FD->isTemplated() && - !FD->isTemplateInstantiation()) { + !FD->isTemplateInstantiation() && + !(FD->hasAttr() || FD->hasAttr())) { assert(FD->isThisDeclarationADefinition()); Diag(FD->getLocation(), diag::err_extern_def_in_header_unit); FD->setInvalidDecl(); diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 3107876565e8e..9475b8a684082 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -1255,7 +1255,7 @@ bool Sema::isValidPointerAttrType(QualType T, bool RefOkay) { // The nonnull attribute, and other similar attributes, can be applied to a // transparent union that contains a pointer type. if (const RecordType *UT = T->getAsUnionType()) { - RecordDecl *UD = UT->getOriginalDecl()->getDefinitionOrSelf(); + RecordDecl *UD = UT->getDecl()->getDefinitionOrSelf(); if (UD->hasAttr()) { for (const auto *I : UD->fields()) { QualType QT = I->getType(); @@ -3629,18 +3629,20 @@ static FormatAttrKind getFormatAttrKind(StringRef Format) { // Check for formats that get handled specially. .Case("NSString", NSStringFormat) .Case("CFString", CFStringFormat) - .Cases("gnu_strftime", "strftime", StrftimeFormat) + .Cases({"gnu_strftime", "strftime"}, StrftimeFormat) // Otherwise, check for supported formats. - .Cases("gnu_scanf", "scanf", "gnu_printf", "printf", "printf0", - "gnu_strfmon", "strfmon", SupportedFormat) - .Cases("cmn_err", "vcmn_err", "zcmn_err", SupportedFormat) - .Cases("kprintf", "syslog", SupportedFormat) // OpenBSD. - .Case("freebsd_kprintf", SupportedFormat) // FreeBSD. 
+ .Cases({"gnu_scanf", "scanf", "gnu_printf", "printf", "printf0", + "gnu_strfmon", "strfmon"}, + SupportedFormat) + .Cases({"cmn_err", "vcmn_err", "zcmn_err"}, SupportedFormat) + .Cases({"kprintf", "syslog"}, SupportedFormat) // OpenBSD. + .Case("freebsd_kprintf", SupportedFormat) // FreeBSD. .Case("os_trace", SupportedFormat) .Case("os_log", SupportedFormat) - .Cases("gcc_diag", "gcc_cdiag", "gcc_cxxdiag", "gcc_tdiag", IgnoredFormat) + .Cases({"gcc_diag", "gcc_cdiag", "gcc_cxxdiag", "gcc_tdiag"}, + IgnoredFormat) .Default(InvalidFormat); } @@ -5674,6 +5676,114 @@ static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) { AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr); } +static std::pair +makeClusterDimsArgExpr(Sema &S, Expr *E, const CUDAClusterDimsAttr &AL, + const unsigned Idx) { + if (!E || S.DiagnoseUnexpandedParameterPack(E)) + return {}; + + // Accept template arguments for now as they depend on something else. + // We'll get to check them when they eventually get instantiated. + if (E->isInstantiationDependent()) + return {E, 1}; + + std::optional I = E->getIntegerConstantExpr(S.Context); + if (!I) { + S.Diag(E->getExprLoc(), diag::err_attribute_argument_n_type) + << &AL << Idx << AANT_ArgumentIntegerConstant << E->getSourceRange(); + return {}; + } + // Make sure we can fit it in 4 bits. + if (!I->isIntN(4)) { + S.Diag(E->getExprLoc(), diag::err_ice_too_large) + << toString(*I, 10, false) << 4 << /*Unsigned=*/1; + return {}; + } + if (*I < 0) { + S.Diag(E->getExprLoc(), diag::warn_attribute_argument_n_negative) + << &AL << Idx << E->getSourceRange(); + } + + return {ConstantExpr::Create(S.getASTContext(), E, APValue(*I)), + I->getZExtValue()}; +} + +CUDAClusterDimsAttr *Sema::createClusterDimsAttr(const AttributeCommonInfo &CI, + Expr *X, Expr *Y, Expr *Z) { + CUDAClusterDimsAttr TmpAttr(Context, CI, X, Y, Z); + + auto [NewX, ValX] = makeClusterDimsArgExpr(*this, X, TmpAttr, /*Idx=*/0); + auto [NewY, ValY] = makeClusterDimsArgExpr(*this, Y, TmpAttr, /*Idx=*/1); + auto [NewZ, ValZ] = makeClusterDimsArgExpr(*this, Z, TmpAttr, /*Idx=*/2); + + if (!NewX || (Y && !NewY) || (Z && !NewZ)) + return nullptr; + + int FlatDim = ValX * ValY * ValZ; + const llvm::Triple TT = + (!Context.getLangOpts().CUDAIsDevice && Context.getAuxTargetInfo()) + ? Context.getAuxTargetInfo()->getTriple() + : Context.getTargetInfo().getTriple(); + int MaxDim = 1; + if (TT.isNVPTX()) + MaxDim = 8; + else if (TT.isAMDGPU()) + MaxDim = 16; + else + return nullptr; + + // A maximum of 8 thread blocks in a cluster is supported as a portable + // cluster size in CUDA. The number is 16 for AMDGPU. 
+ if (FlatDim > MaxDim) { + Diag(CI.getLoc(), diag::err_cluster_dims_too_large) << MaxDim << FlatDim; + return nullptr; + } + + return CUDAClusterDimsAttr::Create(Context, NewX, NewY, NewZ, CI); +} + +void Sema::addClusterDimsAttr(Decl *D, const AttributeCommonInfo &CI, Expr *X, + Expr *Y, Expr *Z) { + if (auto *Attr = createClusterDimsAttr(CI, X, Y, Z)) + D->addAttr(Attr); +} + +void Sema::addNoClusterAttr(Decl *D, const AttributeCommonInfo &CI) { + D->addAttr(CUDANoClusterAttr::Create(Context, CI)); +} + +static void handleClusterDimsAttr(Sema &S, Decl *D, const ParsedAttr &AL) { + const TargetInfo &TTI = S.Context.getTargetInfo(); + OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU); + if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) || + (TTI.getTriple().isAMDGPU() && + !TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) { + S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported) << AL; + return; + } + + if (!AL.checkAtLeastNumArgs(S, /*Num=*/1) || + !AL.checkAtMostNumArgs(S, /*Num=*/3)) + return; + + S.addClusterDimsAttr(D, AL, AL.getArgAsExpr(0), + AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr, + AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr); +} + +static void handleNoClusterAttr(Sema &S, Decl *D, const ParsedAttr &AL) { + const TargetInfo &TTI = S.Context.getTargetInfo(); + OffloadArch Arch = StringToOffloadArch(TTI.getTargetOpts().CPU); + if ((TTI.getTriple().isNVPTX() && Arch < clang::OffloadArch::SM_90) || + (TTI.getTriple().isAMDGPU() && + !TTI.hasFeatureEnabled(TTI.getTargetOpts().FeatureMap, "clusters"))) { + S.Diag(AL.getLoc(), diag::err_cluster_attr_not_supported) << AL; + return; + } + + S.addNoClusterAttr(D, AL); +} + static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (!AL.isArgIdent(0)) { @@ -7139,6 +7249,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_CUDALaunchBounds: handleLaunchBoundsAttr(S, D, AL); break; + case ParsedAttr::AT_CUDAClusterDims: + handleClusterDimsAttr(S, D, AL); + break; + case ParsedAttr::AT_CUDANoCluster: + handleNoClusterAttr(S, D, AL); + break; case ParsedAttr::AT_Restrict: handleRestrictAttr(S, D, AL); break; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 215431ca71310..d41ab126c426f 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -5630,7 +5630,7 @@ bool Sema::SetCtorInitializers(CXXConstructorDecl *Constructor, bool AnyErrors, static void PopulateKeysForFields(FieldDecl *Field, SmallVectorImpl &IdealInits) { if (const RecordType *RT = Field->getType()->getAsCanonical()) { - const RecordDecl *RD = RT->getOriginalDecl(); + const RecordDecl *RD = RT->getDecl(); if (RD->isAnonymousStructOrUnion()) { for (auto *Field : RD->getDefinitionOrSelf()->fields()) PopulateKeysForFields(Field, IdealInits); @@ -7630,9 +7630,8 @@ static bool defaultedSpecialMemberIsConstexpr( continue; QualType BaseType = S.Context.getBaseElementType(F->getType()); if (const RecordType *RecordTy = BaseType->getAsCanonical()) { - CXXRecordDecl *FieldRecDecl = - cast(RecordTy->getOriginalDecl()) - ->getDefinitionOrSelf(); + auto *FieldRecDecl = + cast(RecordTy->getDecl())->getDefinitionOrSelf(); if (!specialMemberIsConstexpr(S, FieldRecDecl, CSM, BaseType.getCVRQualifiers(), ConstArg && !F->isMutable())) @@ -10645,7 +10644,7 @@ void Sema::checkIllFormedTrivialABIStruct(CXXRecordDecl &RD) { if (const auto *RT = FT->getBaseElementTypeUnsafe()->getAsCanonical()) if 
(!RT->isDependentType() && - !cast(RT->getOriginalDecl()->getDefinitionOrSelf()) + !cast(RT->getDecl()->getDefinitionOrSelf()) ->canPassInRegisters()) { PrintDiagAndRemoveAttr(5); return; diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp index 98eb5afb7c992..3df9f9c1d68c7 100644 --- a/clang/lib/Sema/SemaDeclObjC.cpp +++ b/clang/lib/Sema/SemaDeclObjC.cpp @@ -3232,10 +3232,8 @@ static bool tryMatchRecordTypes(ASTContext &Context, assert(lt && rt && lt != rt); if (!isa(lt) || !isa(rt)) return false; - RecordDecl *left = - cast(lt)->getOriginalDecl()->getDefinitionOrSelf(); - RecordDecl *right = - cast(rt)->getOriginalDecl()->getDefinitionOrSelf(); + RecordDecl *left = cast(lt)->getDecl()->getDefinitionOrSelf(); + RecordDecl *right = cast(rt)->getDecl()->getDefinitionOrSelf(); // Require union-hood to match. if (left->isUnion() != right->isUnion()) return false; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 76344581edc24..686acd3416ecf 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -1536,12 +1536,8 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, Expr *RHS, // are ill-formed. if (getLangOpts().CPlusPlus26) DiagID = diag::warn_conv_mixed_enum_types_cxx26; - else if (!L->castAsCanonical() - ->getOriginalDecl() - ->hasNameForLinkage() || - !R->castAsCanonical() - ->getOriginalDecl() - ->hasNameForLinkage()) { + else if (!L->castAsCanonical()->getDecl()->hasNameForLinkage() || + !R->castAsCanonical()->getDecl()->hasNameForLinkage()) { // If either enumeration type is unnamed, it's less likely that the // user cares about this, but this situation is still deprecated in // C++2a. Use a different warning group. @@ -7112,7 +7108,7 @@ ExprResult Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl, for (unsigned i = 0, e = Args.size(); i != e; i++) { if (const auto *RT = dyn_cast(Args[i]->getType().getCanonicalType())) { - if (RT->getOriginalDecl()->isOrContainsUnion()) + if (RT->getDecl()->isOrContainsUnion()) Diag(Args[i]->getBeginLoc(), diag::warn_cmse_nonsecure_union) << 0 << i; } @@ -9765,7 +9761,7 @@ Sema::CheckTransparentUnionArgumentConstraints(QualType ArgType, if (!UT) return AssignConvertType::Incompatible; - RecordDecl *UD = UT->getOriginalDecl()->getDefinitionOrSelf(); + RecordDecl *UD = UT->getDecl()->getDefinitionOrSelf(); if (!UD->hasAttr()) return AssignConvertType::Incompatible; @@ -10861,7 +10857,7 @@ static void diagnoseScopedEnums(Sema &S, const SourceLocation Loc, auto DiagnosticHelper = [&S](const Expr *expr, const QualType type) { SourceLocation BeginLoc = expr->getBeginLoc(); QualType IntType = type->castAs() - ->getOriginalDecl() + ->getDecl() ->getDefinitionOrSelf() ->getIntegerType(); std::string InsertionString = "static_cast<" + IntType.getAsString() + ">("; @@ -11550,7 +11546,7 @@ QualType Sema::CheckSubtractionOperands(ExprResult &LHS, ExprResult &RHS, static bool isScopedEnumerationType(QualType T) { if (const EnumType *ET = T->getAsCanonical()) - return ET->getOriginalDecl()->isScoped(); + return ET->getDecl()->isScoped(); return false; } @@ -12832,7 +12828,7 @@ QualType Sema::CheckCompareOperands(ExprResult &LHS, ExprResult &RHS, if (auto *CTSD = dyn_cast(DC)) { if (CTSD->isInStdNamespace() && llvm::StringSwitch(CTSD->getName()) - .Cases("less", "less_equal", "greater", "greater_equal", true) + .Cases({"less", "less_equal", "greater", "greater_equal"}, true) .Default(false)) { if (RHSType->isNullPtrType()) RHS = ImpCastExprToType(RHS.get(), LHSType, CK_NullToPointer); 
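The StringSwitch changes scattered through this patch all follow one migration: the variadic Cases overloads give way to an initializer-list form that keeps the matched literals visually grouped before the shared result value. A minimal sketch of the new idiom, with illustrative strings and result values:

    llvm::StringSwitch<int>(Name)
        .Cases({"less", "less_equal", "greater", "greater_equal"}, 1)
        .Case("equal_to", 2)
        .Default(0);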
@@ -13873,7 +13869,7 @@ static void DiagnoseRecursiveConstFields(Sema &S, const ValueDecl *VD, while (RecordTypeList.size() > NextToCheckIndex) { bool IsNested = NextToCheckIndex > 0; for (const FieldDecl *Field : RecordTypeList[NextToCheckIndex] - ->getOriginalDecl() + ->getDecl() ->getDefinitionOrSelf() ->fields()) { // First, check every field for constness. @@ -16014,6 +16010,20 @@ ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc, return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr) << resultType << Input.get()->getSourceRange()); } + } else if (Context.getLangOpts().HLSL && resultType->isVectorType() && + !resultType->hasBooleanRepresentation()) { + // HLSL unary logical 'not' behaves like C++, which states that the + // operand is converted to bool and the result is bool, however HLSL + // extends this property to vectors. + const VectorType *VTy = resultType->castAs(); + resultType = + Context.getExtVectorType(Context.BoolTy, VTy->getNumElements()); + + Input = ImpCastExprToType( + Input.get(), resultType, + ScalarTypeToBooleanCastKind(VTy->getElementType())) + .get(); + break; } else if (resultType->isExtVectorType()) { if (Context.getLangOpts().OpenCL && Context.getLangOpts().getOpenCLCompatibleVersion() < 120) { diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 0fe242dce45e9..fe1f89b7a5dfa 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1988,7 +1988,7 @@ static bool doesUsualArrayDeleteWantSize(Sema &S, SourceLocation loc, DeclarationName deleteName = S.Context.DeclarationNames.getCXXOperatorName(OO_Array_Delete); LookupResult ops(S, deleteName, loc, Sema::LookupOrdinaryName); - S.LookupQualifiedName(ops, record->getOriginalDecl()->getDefinitionOrSelf()); + S.LookupQualifiedName(ops, record->getDecl()->getDefinitionOrSelf()); // We're just doing this for information. ops.suppressDiagnostics(); @@ -6667,8 +6667,7 @@ ExprResult Sema::MaybeBindToTemporary(Expr *E) { // That should be enough to guarantee that this type is complete, if we're // not processing a decltype expression. 
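// Note on the CreateBuiltinUnaryOp hunk above: for HLSL, '!' applied to a
// non-bool vector now mirrors C++'s scalar rule lane by lane, e.g. (HLSL
// sketch):
//   float4 V;
//   bool4 B = !V;   // each lane converts to bool, then negates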
- CXXRecordDecl *RD = - cast(RT->getOriginalDecl())->getDefinitionOrSelf(); + auto *RD = cast(RT->getDecl())->getDefinitionOrSelf(); if (RD->isInvalidDecl() || RD->isDependentContext()) return E; diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp index 331f6e585555b..4daf01703d7dd 100644 --- a/clang/lib/Sema/SemaExprObjC.cpp +++ b/clang/lib/Sema/SemaExprObjC.cpp @@ -3846,8 +3846,7 @@ static inline T *getObjCBridgeAttr(const TypedefType *TD) { if (QT->isPointerType()) { QT = QT->getPointeeType(); if (const RecordType *RT = QT->getAsCanonical()) { - for (auto *Redecl : - RT->getOriginalDecl()->getMostRecentDecl()->redecls()) { + for (auto *Redecl : RT->getDecl()->getMostRecentDecl()->redecls()) { if (auto *attr = Redecl->getAttr()) return attr; } diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 72b2ac99ec53c..5b3e89f936327 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -782,7 +782,7 @@ bool SemaHLSL::isSemanticValid(FunctionDecl *FD, DeclaratorDecl *D) { if (!RT) return false; - const RecordDecl *RD = RT->getOriginalDecl(); + const RecordDecl *RD = RT->getDecl(); for (FieldDecl *Field : RD->fields()) { if (!isSemanticValid(FD, Field)) return false; @@ -1986,7 +1986,7 @@ SemaHLSL::TakeLocForHLSLAttribute(const HLSLAttributedResourceType *RT) { // requirements and adds them to Bindings void SemaHLSL::collectResourceBindingsOnUserRecordDecl(const VarDecl *VD, const RecordType *RT) { - const RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *RD = RT->getDecl()->getDefinitionOrSelf(); for (FieldDecl *FD : RD->fields()) { const Type *Ty = FD->getType()->getUnqualifiedDesugaredType(); @@ -3006,6 +3006,24 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { TheCall->setType(CounterHandleTy); break; } + case Builtin::BI__builtin_hlsl_resource_getdimensions_x: { + ASTContext &AST = SemaRef.getASTContext(); + if (SemaRef.checkArgCount(TheCall, 2) || + CheckResourceHandle(&SemaRef, TheCall, 0) || + CheckArgTypeMatches(&SemaRef, TheCall->getArg(1), AST.UnsignedIntTy) || + CheckModifiableLValue(&SemaRef, TheCall, 1)) + return true; + break; + } + case Builtin::BI__builtin_hlsl_resource_getstride: { + ASTContext &AST = SemaRef.getASTContext(); + if (SemaRef.checkArgCount(TheCall, 2) || + CheckResourceHandle(&SemaRef, TheCall, 0) || + CheckArgTypeMatches(&SemaRef, TheCall->getArg(1), AST.UnsignedIntTy) || + CheckModifiableLValue(&SemaRef, TheCall, 1)) + return true; + break; + } case Builtin::BI__builtin_hlsl_and: case Builtin::BI__builtin_hlsl_or: { if (SemaRef.checkArgCount(TheCall, 2)) diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index ba2633946017b..42cb3c07b7247 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -672,11 +672,12 @@ ExprResult InitListChecker::PerformEmptyInit(SourceLocation Loc, IsInStd = true; } - if (IsInStd && llvm::StringSwitch(R->getName()) - .Cases("basic_string", "deque", "forward_list", true) - .Cases("list", "map", "multimap", "multiset", true) - .Cases("priority_queue", "queue", "set", "stack", true) - .Cases("unordered_map", "unordered_set", "vector", true) + if (IsInStd && + llvm::StringSwitch(R->getName()) + .Cases({"basic_string", "deque", "forward_list"}, true) + .Cases({"list", "map", "multimap", "multiset"}, true) + .Cases({"priority_queue", "queue", "set", "stack"}, true) + .Cases({"unordered_map", "unordered_set", "vector"}, true) .Default(false)) { 
InitSeq.InitializeFrom( SemaRef, Entity, @@ -775,7 +776,7 @@ void InitListChecker::FillInEmptyInitForField(unsigned Init, FieldDecl *Field, if (Init >= NumInits || !ILE->getInit(Init)) { if (const RecordType *RType = ILE->getType()->getAsCanonical()) - if (!RType->getOriginalDecl()->isUnion()) + if (!RType->getDecl()->isUnion()) assert((Init < NumInits || VerifyOnly) && "This ILE should have been expanded"); @@ -9195,9 +9196,8 @@ bool InitializationSequence::Diagnose(Sema &S, diag::note_member_declared_at); if (const auto *Record = Entity.getType()->getAs()) - S.Diag(Record->getOriginalDecl()->getLocation(), - diag::note_previous_decl) - << S.Context.getCanonicalTagType(Record->getOriginalDecl()); + S.Diag(Record->getDecl()->getLocation(), diag::note_previous_decl) + << S.Context.getCanonicalTagType(Record->getDecl()); } break; } @@ -9991,8 +9991,8 @@ QualType Sema::DeduceTemplateSpecializationFromInitializer( // Cases where template arguments in the RHS of the alias are not // dependent. e.g. // using AliasFoo = Foo; - if (const auto *CTSD = llvm::dyn_cast( - RT->getOriginalDecl())) + if (const auto *CTSD = + llvm::dyn_cast(RT->getDecl())) Template = CTSD->getSpecializedTemplate(); } } diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index 25728de1779ad..5915d6e57d893 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -2727,9 +2727,7 @@ bool Sema::LookupParsedName(LookupResult &R, Scope *S, CXXScopeSpec *SS, IsDependent = !DC && ObjectType->isDependentType(); assert(((!DC && ObjectType->isDependentType()) || !ObjectType->isIncompleteType() || !ObjectType->getAs() || - ObjectType->castAs() - ->getOriginalDecl() - ->isEntityBeingDefined()) && + ObjectType->castAs()->getDecl()->isEntityBeingDefined()) && "Caller should have completed object type"); } else if (SS && SS->isNotEmpty()) { // This nested-name-specifier occurs after another nested-name-specifier, @@ -3191,9 +3189,8 @@ addAssociatedClassesAndNamespaces(AssociatedLookup &Result, QualType Ty) { // namespaces of its associated classes. case Type::Record: { // FIXME: This should use the original decl. 
-    CXXRecordDecl *Class =
-        cast<CXXRecordDecl>(cast<RecordType>(T)->getOriginalDecl())
-            ->getDefinitionOrSelf();
+    auto *Class = cast<CXXRecordDecl>(cast<RecordType>(T)->getDecl())
+                      ->getDefinitionOrSelf();
     addAssociatedClassesAndNamespaces(Result, Class);
     break;
   }
@@ -4606,7 +4603,7 @@ static void getNestedNameSpecifierIdentifiers(
   case Type::InjectedClassName: {
     auto *TT = cast<InjectedClassNameType>(T);
     getNestedNameSpecifierIdentifiers(TT->getQualifier(), Identifiers);
-    Identifiers.push_back(TT->getOriginalDecl()->getIdentifier());
+    Identifiers.push_back(TT->getDecl()->getIdentifier());
     return;
   }
   case Type::Typedef: {
diff --git a/clang/lib/Sema/SemaObjC.cpp b/clang/lib/Sema/SemaObjC.cpp
index 4f9470a361d2d..7aaa56e37b3be 100644
--- a/clang/lib/Sema/SemaObjC.cpp
+++ b/clang/lib/Sema/SemaObjC.cpp
@@ -1407,7 +1407,7 @@ SemaObjC::ObjCSubscriptKind SemaObjC::CheckSubscriptingKind(Expr *FromE) {
   int NoIntegrals = 0, NoObjCIdPointers = 0;
   SmallVector ConversionDecls;
-  for (NamedDecl *D : cast<CXXRecordDecl>(RecordTy->getOriginalDecl())
+  for (NamedDecl *D : cast<CXXRecordDecl>(RecordTy->getDecl())
                           ->getDefinitionOrSelf()
                           ->getVisibleConversionFunctions()) {
     if (CXXConversionDecl *Conversion =
@@ -1511,7 +1511,7 @@ bool SemaObjC::isCFStringType(QualType T) {
   if (!RT)
     return false;

-  const RecordDecl *RD = RT->getOriginalDecl();
+  const RecordDecl *RD = RT->getDecl();
   if (RD->getTagKind() != TagTypeKind::Struct)
     return false;
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index ca99834ce8266..3bb8080f6e72c 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -2996,6 +2996,8 @@ bool SemaOpenACC::CreateReductionCombinerRecipe(

   case OpenACCReductionOperator::Max:
   case OpenACCReductionOperator::Min:
+    BinOp = BinaryOperatorKind::BO_LT;
+    break;
   case OpenACCReductionOperator::And:
   case OpenACCReductionOperator::Or:
     // We just want a 'NYI' error in the backend, so leave an empty combiner
@@ -3011,26 +3013,80 @@ bool SemaOpenACC::CreateReductionCombinerRecipe(
   assert(!VarTy->isArrayType() && "Only 1 level of array allowed");

+  enum class CombinerFailureKind {
+    None = 0,
+    BinOp = 1,
+    Conditional = 2,
+    Assignment = 3,
+  };
+
+  auto genCombiner = [&, this](DeclRefExpr *LHSDRE, DeclRefExpr *RHSDRE)
+      -> std::pair<ExprResult, CombinerFailureKind> {
+    ExprResult BinOpRes =
+        SemaRef.BuildBinOp(SemaRef.getCurScope(), Loc, BinOp, LHSDRE, RHSDRE,
+                           /*ForFoldExpr=*/false);
+    switch (ReductionOperator) {
+    case OpenACCReductionOperator::Addition:
+    case OpenACCReductionOperator::Multiplication:
+    case OpenACCReductionOperator::BitwiseAnd:
+    case OpenACCReductionOperator::BitwiseOr:
+    case OpenACCReductionOperator::BitwiseXOr:
+      // These 5 are simple and are being done as compound operators, so we can
+      // immediately quit here.
+      return {BinOpRes, BinOpRes.isUsable() ? CombinerFailureKind::None
+                                            : CombinerFailureKind::BinOp};
+    case OpenACCReductionOperator::Max:
+    case OpenACCReductionOperator::Min: {
+      // These are done as:
+      // LHS = (LHS < RHS) ? LHS : RHS; and LHS = (LHS < RHS) ? RHS : LHS;
+      //
+      // The BinOpRes should have been created with the less-than, so we just
+      // have to build the conditional and assignment.
+      if (!BinOpRes.isUsable())
+        return {BinOpRes, CombinerFailureKind::BinOp};
+
+      // Create the correct conditional operator, swapping the results
+      // (true/false value) depending on min/max.
+      ExprResult CondRes;
+      if (ReductionOperator == OpenACCReductionOperator::Min)
+        CondRes = SemaRef.ActOnConditionalOp(Loc, Loc, BinOpRes.get(), LHSDRE,
+                                             RHSDRE);
+      else
+        CondRes = SemaRef.ActOnConditionalOp(Loc, Loc, BinOpRes.get(), RHSDRE,
+                                             LHSDRE);
+
+      if (!CondRes.isUsable())
+        return {CondRes, CombinerFailureKind::Conditional};
+
+      // Build assignment.
+      ExprResult Assignment = SemaRef.BuildBinOp(SemaRef.getCurScope(), Loc,
+                                                 BinaryOperatorKind::BO_Assign,
+                                                 LHSDRE, CondRes.get(),
+                                                 /*ForFoldExpr=*/false);
+      return {Assignment, Assignment.isUsable()
+                              ? CombinerFailureKind::None
+                              : CombinerFailureKind::Assignment};
+    }
+    case OpenACCReductionOperator::And:
+    case OpenACCReductionOperator::Or:
+      llvm_unreachable("And/Or not implemented, but should fail earlier");
+    case OpenACCReductionOperator::Invalid:
+      llvm_unreachable("Invalid should have been caught above");
+    }
+  };
+
   auto tryCombiner = [&, this](DeclRefExpr *LHSDRE, DeclRefExpr *RHSDRE,
                                bool IncludeTrap) {
-    // TODO: OpenACC: we have to figure out based on the bin-op how to do the
-    // ones that we can't just use compound operators for. So &&, ||, max, and
-    // min aren't really clear what we could do here.
     if (IncludeTrap) {
       // Trap all of the errors here, we'll emit our own at the end.
       Sema::TentativeAnalysisScope Trap{SemaRef};
-
-      return SemaRef.BuildBinOp(SemaRef.getCurScope(), Loc, BinOp, LHSDRE,
-                                RHSDRE,
-                                /*ForFoldExpr=*/false);
-    } else {
-      return SemaRef.BuildBinOp(SemaRef.getCurScope(), Loc, BinOp, LHSDRE,
-                                RHSDRE,
-                                /*ForFoldExpr=*/false);
+      return genCombiner(LHSDRE, RHSDRE);
     }
+    return genCombiner(LHSDRE, RHSDRE);
   };

   struct CombinerAttemptTy {
+    CombinerFailureKind FailKind;
     VarDecl *LHS;
     DeclRefExpr *LHSDRE;
     VarDecl *RHS;
@@ -3058,9 +3114,11 @@ bool SemaOpenACC::CreateReductionCombinerRecipe(
                           RHSDecl->getBeginLoc()},
         Ty, clang::VK_LValue, RHSDecl, nullptr, NOUR_None);

-    ExprResult BinOpResult = tryCombiner(LHSDRE, RHSDRE, /*IncludeTrap=*/true);
+    std::pair<ExprResult, CombinerFailureKind> BinOpResult =
+        tryCombiner(LHSDRE, RHSDRE, /*IncludeTrap=*/true);

-    return {LHSDecl, LHSDRE, RHSDecl, RHSDRE, BinOpResult.get()};
+    return {BinOpResult.second, LHSDecl, LHSDRE, RHSDecl, RHSDRE,
+            BinOpResult.first.get()};
   };

   CombinerAttemptTy TopLevelCombinerInfo = formCombiner(VarTy);
@@ -3081,12 +3139,20 @@ bool SemaOpenACC::CreateReductionCombinerRecipe(
     }
   }

+  auto EmitFailureNote = [&](CombinerFailureKind CFK) {
+    if (CFK == CombinerFailureKind::BinOp)
+      return Diag(Loc, diag::note_acc_reduction_combiner_forming)
+             << CFK << BinaryOperator::getOpcodeStr(BinOp);
+    return Diag(Loc, diag::note_acc_reduction_combiner_forming) << CFK;
+  };
+
   // Since the 'root' level didn't fail, the only thing that could be successful
   // is a struct that we decompose on its individual fields.
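// For instance (user code, as a sketch): given
//   struct Pt { int X, Y; };  Pt P;
//   #pragma acc parallel loop reduction(+:P)
// the combiner is formed field by field (P.X += RHS.X; P.Y += RHS.Y), and when
// a field's combiner cannot be formed, EmitFailureNote records which stage
// (binary op, conditional, or assignment) failed for that field.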
RecordDecl *RD = VarTy->getAsRecordDecl(); if (!RD) { Diag(Loc, diag::err_acc_reduction_recipe_no_op) << VarTy; + EmitFailureNote(TopLevelCombinerInfo.FailKind); tryCombiner(TopLevelCombinerInfo.LHSDRE, TopLevelCombinerInfo.RHSDRE, /*IncludeTrap=*/false); return true; @@ -3098,6 +3164,7 @@ bool SemaOpenACC::CreateReductionCombinerRecipe( if (!FieldCombinerInfo.Op || FieldCombinerInfo.Op->containsErrors()) { Diag(Loc, diag::err_acc_reduction_recipe_no_op) << FD->getType(); Diag(FD->getBeginLoc(), diag::note_acc_reduction_recipe_noop_field) << RD; + EmitFailureNote(FieldCombinerInfo.FailKind); tryCombiner(FieldCombinerInfo.LHSDRE, FieldCombinerInfo.RHSDRE, /*IncludeTrap=*/false); return true; diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp index ead97816defe7..17078e8814a43 100644 --- a/clang/lib/Sema/SemaOpenACCClause.cpp +++ b/clang/lib/Sema/SemaOpenACCClause.cpp @@ -1924,7 +1924,7 @@ bool SemaOpenACC::CheckReductionVarType(Expr *VarExpr) { // off here. This will result in CurType being the actual 'type' of the // expression, which is what we are looking to check. QualType CurType = isa(VarExpr) - ? ArraySectionExpr::getBaseOriginalType(VarExpr) + ? cast(VarExpr)->getElementType() : VarExpr->getType(); // This can happen when we have a dependent type in an array element that the diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 6b820b33ffce2..06e5dab35cc3e 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -2595,7 +2595,7 @@ IsTransparentUnionStandardConversion(Sema &S, Expr* From, if (!UT) return false; // The field to initialize within the transparent union. - const RecordDecl *UD = UT->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *UD = UT->getDecl()->getDefinitionOrSelf(); if (!UD->hasAttr()) return false; // It's compatible if the expression matches any of the fields. @@ -3974,7 +3974,7 @@ IsUserDefinedConversion(Sema &S, Expr *From, QualType ToType, if (!S.isCompleteType(From->getExprLoc(), ToType)) { // We're not going to find any constructors. } else if (auto *ToRecordDecl = - dyn_cast(ToRecordType->getOriginalDecl())) { + dyn_cast(ToRecordType->getDecl())) { ToRecordDecl = ToRecordDecl->getDefinitionOrSelf(); Expr **Args = &From; @@ -4049,7 +4049,7 @@ IsUserDefinedConversion(Sema &S, Expr *From, QualType ToType, } else if (const RecordType *FromRecordType = From->getType()->getAsCanonical()) { if (auto *FromRecordDecl = - dyn_cast(FromRecordType->getOriginalDecl())) { + dyn_cast(FromRecordType->getDecl())) { FromRecordDecl = FromRecordDecl->getDefinitionOrSelf(); // Add all of the conversion functions as candidates. const auto &Conversions = FromRecordDecl->getVisibleConversionFunctions(); @@ -6842,7 +6842,7 @@ ExprResult Sema::PerformContextualImplicitConversion( UnresolvedSet<4> ViableConversions; // These are *potentially* viable in C++1y. 
UnresolvedSet<4> ExplicitConversions; - const auto &Conversions = cast(RecordTy->getOriginalDecl()) + const auto &Conversions = cast(RecordTy->getDecl()) ->getDefinitionOrSelf() ->getVisibleConversionFunctions(); @@ -10196,9 +10196,7 @@ class BuiltinOperatorOverloadBuilder { if (S.getLangOpts().CPlusPlus11) { for (QualType EnumTy : CandidateTypes[ArgIdx].enumeration_types()) { - if (!EnumTy->castAsCanonical() - ->getOriginalDecl() - ->isScoped()) + if (!EnumTy->castAsCanonical()->getDecl()->isScoped()) continue; if (!AddedTypes.insert(S.Context.getCanonicalType(EnumTy)).second) @@ -13222,7 +13220,10 @@ void OverloadCandidateSet::NoteCandidates( auto Cands = CompleteCandidates(S, OCD, Args, OpLoc, Filter); - S.Diag(PD.first, PD.second, shouldDeferDiags(S, Args, OpLoc)); + { + Sema::DeferDiagsRAII RAII{S, shouldDeferDiags(S, Args, OpLoc)}; + S.Diag(PD.first, PD.second); + } // In WebAssembly we don't want to emit further diagnostics if a table is // passed as an argument to a function. @@ -13285,10 +13286,10 @@ void OverloadCandidateSet::NoteCandidates(Sema &S, ArrayRef Args, // inform the future value of S.Diags.getNumOverloadCandidatesToShow(). S.Diags.overloadCandidatesShown(CandsShown); - if (I != E) - S.Diag(OpLoc, diag::note_ovl_too_many_candidates, - shouldDeferDiags(S, Args, OpLoc)) - << int(E - I); + if (I != E) { + Sema::DeferDiagsRAII RAII{S, shouldDeferDiags(S, Args, OpLoc)}; + S.Diag(OpLoc, diag::note_ovl_too_many_candidates) << int(E - I); + } } static SourceLocation diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp index 3ba93ff98898b..b5f91a318ced9 100644 --- a/clang/lib/Sema/SemaRISCV.cpp +++ b/clang/lib/Sema/SemaRISCV.cpp @@ -1445,39 +1445,40 @@ void SemaRISCV::checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D, if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Double) && !FeatureMap.lookup("zve64d")) - Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64d"; + Diag(Loc, diag::err_riscv_type_requires_extension) << Ty << "zve64d"; // (ELEN, LMUL) pairs of (8, mf8), (16, mf4), (32, mf2), (64, m1) requires at // least zve64x else if (((EltSize == 64 && Info.ElementType->isIntegerType()) || MinElts == 1) && !FeatureMap.lookup("zve64x")) - Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64x"; + Diag(Loc, diag::err_riscv_type_requires_extension) << Ty << "zve64x"; else if (Info.ElementType->isFloat16Type() && !FeatureMap.lookup("zvfh") && !FeatureMap.lookup("zvfhmin") && !FeatureMap.lookup("xandesvpackfph")) if (DeclareAndesVectorBuiltins) { - Diag(Loc, diag::err_riscv_type_requires_extension, D) + Diag(Loc, diag::err_riscv_type_requires_extension) << Ty << "zvfh, zvfhmin or xandesvpackfph"; } else { - Diag(Loc, diag::err_riscv_type_requires_extension, D) + Diag(Loc, diag::err_riscv_type_requires_extension) << Ty << "zvfh or zvfhmin"; } else if (Info.ElementType->isBFloat16Type() && !FeatureMap.lookup("zvfbfmin") && - !FeatureMap.lookup("xandesvbfhcvt")) + !FeatureMap.lookup("xandesvbfhcvt") && + !FeatureMap.lookup("experimental-zvfbfa")) if (DeclareAndesVectorBuiltins) { - Diag(Loc, diag::err_riscv_type_requires_extension, D) + Diag(Loc, diag::err_riscv_type_requires_extension) << Ty << "zvfbfmin or xandesvbfhcvt"; } else { - Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zvfbfmin"; + Diag(Loc, diag::err_riscv_type_requires_extension) << Ty << "zvfbfmin"; } else if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Float) && !FeatureMap.lookup("zve32f")) - Diag(Loc, 
diag::err_riscv_type_requires_extension, D) << Ty << "zve32f"; + Diag(Loc, diag::err_riscv_type_requires_extension) << Ty << "zve32f"; // Given that caller already checked isRVVType() before calling this function, // if we don't have at least zve32x supported, then we need to emit error. else if (!FeatureMap.lookup("zve32x")) - Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32x"; + Diag(Loc, diag::err_riscv_type_requires_extension) << Ty << "zve32x"; } /// Are the two types RVV-bitcast-compatible types? I.e. is bitcasting from the diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index b981c35c8083f..67f3856c10615 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -221,8 +221,8 @@ static SourceLocation SourceLocationForUserDeclaredType(QualType QT) { SourceLocation Loc; const Type *T = QT->getUnqualifiedDesugaredType(); if (const TagType *TT = dyn_cast(T)) - Loc = TT->getOriginalDecl()->getLocation(); - else if (const ObjCInterfaceType *ObjCIT = dyn_cast(T)) + Loc = TT->getDecl()->getLocation(); + else if (const auto *ObjCIT = dyn_cast(T)) Loc = ObjCIT->getDecl()->getLocation(); return Loc; } diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index ae0bb616beb82..f39896336053e 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -1277,11 +1277,11 @@ static void checkEnumTypesInSwitchStmt(Sema &S, const Expr *Cond, return; // Ignore anonymous enums. - if (!CondEnumType->getOriginalDecl()->getIdentifier() && - !CondEnumType->getOriginalDecl()->getTypedefNameForAnonDecl()) + if (!CondEnumType->getDecl()->getIdentifier() && + !CondEnumType->getDecl()->getTypedefNameForAnonDecl()) return; - if (!CaseEnumType->getOriginalDecl()->getIdentifier() && - !CaseEnumType->getOriginalDecl()->getTypedefNameForAnonDecl()) + if (!CaseEnumType->getDecl()->getIdentifier() && + !CaseEnumType->getDecl()->getTypedefNameForAnonDecl()) return; if (S.Context.hasSameUnqualifiedType(CondType, CaseType)) @@ -3760,7 +3760,7 @@ class LocalTypedefNameReferencer : public DynamicRecursiveASTVisitor { Sema &S; }; bool LocalTypedefNameReferencer::VisitRecordType(RecordType *RT) { - auto *R = dyn_cast(RT->getOriginalDecl()); + auto *R = dyn_cast(RT->getDecl()); if (!R || !R->isLocalClass() || !R->isLocalClass()->isExternallyVisible() || R->isDependentType()) return true; @@ -3979,7 +3979,7 @@ StmtResult Sema::BuildReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp, << RetValExp->getSourceRange(); if (FD->hasAttr() && RetValExp) { if (const auto *RT = dyn_cast(FnRetType.getCanonicalType())) { - if (RT->getOriginalDecl()->isOrContainsUnion()) + if (RT->getDecl()->isOrContainsUnion()) Diag(RetValExp->getBeginLoc(), diag::warn_cmse_nonsecure_union) << 1; } } diff --git a/clang/lib/Sema/SemaStmtAsm.cpp b/clang/lib/Sema/SemaStmtAsm.cpp index 0438af752a69e..f957bdf7156c7 100644 --- a/clang/lib/Sema/SemaStmtAsm.cpp +++ b/clang/lib/Sema/SemaStmtAsm.cpp @@ -908,7 +908,7 @@ bool Sema::LookupInlineAsmField(StringRef Base, StringRef Member, LookupResult FieldResult(*this, &Context.Idents.get(NextMember), SourceLocation(), LookupMemberName); - RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf(); + RecordDecl *RD = RT->getDecl()->getDefinitionOrSelf(); if (!LookupQualifiedName(FieldResult, RD)) return true; diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index 50acc83f1841c..27fd5563cc40e 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -81,7 +81,7 @@ 
static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A, StringRef PragmaName = llvm::StringSwitch( PragmaNameLoc->getIdentifierInfo()->getName()) - .Cases("unroll", "nounroll", "unroll_and_jam", "nounroll_and_jam", + .Cases({"unroll", "nounroll", "unroll_and_jam", "nounroll_and_jam"}, PragmaNameLoc->getIdentifierInfo()->getName()) .Default("clang loop"); diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 3a6ff9910667d..2cc65935def53 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -408,9 +408,7 @@ bool Sema::LookupTemplateName(LookupResult &Found, Scope *S, CXXScopeSpec &SS, IsDependent = !LookupCtx && ObjectType->isDependentType(); assert((IsDependent || !ObjectType->isIncompleteType() || !ObjectType->getAs() || - ObjectType->castAs() - ->getOriginalDecl() - ->isEntityBeingDefined()) && + ObjectType->castAs()->getDecl()->isEntityBeingDefined()) && "Caller should have completed object type"); // Template names cannot appear inside an Objective-C class or object type @@ -1819,7 +1817,7 @@ class ConstraintRefersToContainingTemplateChecker } bool VisitTagType(const TagType *T) override { - return TraverseDecl(T->getOriginalDecl()); + return TraverseDecl(T->getDecl()); } bool TraverseDecl(const Decl *D) override { @@ -2790,7 +2788,7 @@ struct DependencyChecker : DynamicRecursiveASTVisitor { // An InjectedClassNameType will never have a dependent template name, // so no need to traverse it. return TraverseTemplateArguments( - T->getTemplateArgs(T->getOriginalDecl()->getASTContext())); + T->getTemplateArgs(T->getDecl()->getASTContext())); } }; } // end anonymous namespace @@ -2914,7 +2912,7 @@ TemplateParameterList *Sema::MatchTemplateParametersToScopeSpecifier( if (const EnumType *EnumT = T->getAsCanonical()) { // FIXME: Forward-declared enums require a TSK_ExplicitSpecialization // check here. - EnumDecl *Enum = EnumT->getOriginalDecl(); + EnumDecl *Enum = EnumT->getDecl(); // Get to the parent type. 
if (TypeDecl *Parent = dyn_cast(Enum->getParent())) @@ -3352,7 +3350,7 @@ static QualType builtinCommonTypeImpl(Sema &S, ElaboratedTypeKeyword Keyword, } static bool isInVkNamespace(const RecordType *RT) { - DeclContext *DC = RT->getOriginalDecl()->getDeclContext(); + DeclContext *DC = RT->getDecl()->getDeclContext(); if (!DC) return false; @@ -3369,9 +3367,8 @@ static SpirvOperand checkHLSLSpirvTypeOperand(Sema &SemaRef, if (auto *RT = OperandArg->getAsCanonical()) { bool Literal = false; SourceLocation LiteralLoc; - if (isInVkNamespace(RT) && RT->getOriginalDecl()->getName() == "Literal") { - auto SpecDecl = - dyn_cast(RT->getOriginalDecl()); + if (isInVkNamespace(RT) && RT->getDecl()->getName() == "Literal") { + auto SpecDecl = dyn_cast(RT->getDecl()); assert(SpecDecl); const TemplateArgumentList &LiteralArgs = SpecDecl->getTemplateArgs(); @@ -3382,9 +3379,8 @@ static SpirvOperand checkHLSLSpirvTypeOperand(Sema &SemaRef, } if (RT && isInVkNamespace(RT) && - RT->getOriginalDecl()->getName() == "integral_constant") { - auto SpecDecl = - dyn_cast(RT->getOriginalDecl()); + RT->getDecl()->getName() == "integral_constant") { + auto SpecDecl = dyn_cast(RT->getDecl()); assert(SpecDecl); const TemplateArgumentList &ConstantArgs = SpecDecl->getTemplateArgs(); @@ -4110,7 +4106,7 @@ TypeResult Sema::ActOnTagTemplateIdType(TagUseKind TUK, // Check the tag kind if (const RecordType *RT = Result->getAs()) { - RecordDecl *D = RT->getOriginalDecl(); + RecordDecl *D = RT->getDecl(); IdentifierInfo *Id = D->getIdentifier(); assert(Id && "templated class must have an identifier"); @@ -6383,11 +6379,11 @@ bool UnnamedLocalNoLinkageFinder::VisitDeducedTemplateSpecializationType( } bool UnnamedLocalNoLinkageFinder::VisitRecordType(const RecordType* T) { - return VisitTagDecl(T->getOriginalDecl()->getDefinitionOrSelf()); + return VisitTagDecl(T->getDecl()->getDefinitionOrSelf()); } bool UnnamedLocalNoLinkageFinder::VisitEnumType(const EnumType* T) { - return VisitTagDecl(T->getOriginalDecl()->getDefinitionOrSelf()); + return VisitTagDecl(T->getDecl()->getDefinitionOrSelf()); } bool UnnamedLocalNoLinkageFinder::VisitTemplateTypeParmType( @@ -6412,7 +6408,7 @@ bool UnnamedLocalNoLinkageFinder::VisitTemplateSpecializationType( bool UnnamedLocalNoLinkageFinder::VisitInjectedClassNameType( const InjectedClassNameType* T) { - return VisitTagDecl(T->getOriginalDecl()->getDefinitionOrSelf()); + return VisitTagDecl(T->getDecl()->getDefinitionOrSelf()); } bool UnnamedLocalNoLinkageFinder::VisitDependentNameType( diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 3baa9775a49e4..6964242b39d6e 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -5577,7 +5577,7 @@ static TemplateDeductionResult CheckDeductionConsistency( bool IsDeductionGuide = isa(FTD->getTemplatedDecl()); if (IsDeductionGuide) { if (auto *Injected = P->getAsCanonical()) - P = Injected->getOriginalDecl()->getCanonicalTemplateSpecializationType( + P = Injected->getDecl()->getCanonicalTemplateSpecializationType( S.Context); } QualType InstP = S.SubstType(P.getCanonicalType(), MLTAL, FTD->getLocation(), @@ -5598,10 +5598,10 @@ static TemplateDeductionResult CheckDeductionConsistency( auto T2 = S.Context.getUnqualifiedArrayType(A.getNonReferenceType()); if (IsDeductionGuide) { if (auto *Injected = T1->getAsCanonical()) - T1 = Injected->getOriginalDecl()->getCanonicalTemplateSpecializationType( + T1 = 
Injected->getDecl()->getCanonicalTemplateSpecializationType( S.Context); if (auto *Injected = T2->getAsCanonical()) - T2 = Injected->getOriginalDecl()->getCanonicalTemplateSpecializationType( + T2 = Injected->getDecl()->getCanonicalTemplateSpecializationType( S.Context); } if (!S.Context.hasSameType(T1, T2)) @@ -6973,7 +6973,7 @@ MarkUsedTemplateParameters(ASTContext &Ctx, QualType T, case Type::InjectedClassName: T = cast(T) - ->getOriginalDecl() + ->getDecl() ->getCanonicalTemplateSpecializationType(Ctx); [[fallthrough]]; diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index 8ba23aa334afd..ad50600f6399c 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -996,7 +996,7 @@ getRHSTemplateDeclAndArgs(Sema &SemaRef, TypeAliasTemplateDecl *AliasTemplate) { // dependent. e.g. // using AliasFoo = Foo; if (const auto *CTSD = - dyn_cast(RT->getOriginalDecl())) { + dyn_cast(RT->getDecl())) { Template = CTSD->getSpecializedTemplate(); AliasRhsTemplateArgs = CTSD->getTemplateArgs().asArray(); } @@ -1054,12 +1054,11 @@ BuildDeductionGuideForTypeAlias(Sema &SemaRef, // such that T can be deduced as U. auto RType = F->getTemplatedDecl()->getReturnType(); // The (trailing) return type of the deduction guide. - const TemplateSpecializationType *FReturnType = - RType->getAs(); + const auto *FReturnType = RType->getAs(); if (const auto *ICNT = RType->getAsCanonical()) // implicitly-generated deduction guide. FReturnType = cast( - ICNT->getOriginalDecl()->getCanonicalTemplateSpecializationType( + ICNT->getDecl()->getCanonicalTemplateSpecializationType( SemaRef.Context)); assert(FReturnType && "expected to see a return type"); // Deduce template arguments of the deduction guide f from the RHS of diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 7b05e4cf1385f..7f858050db13e 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -35,6 +35,7 @@ #include "clang/Sema/Template.h" #include "clang/Sema/TemplateDeduction.h" #include "clang/Sema/TemplateInstCallback.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SaveAndRestore.h" @@ -638,15 +639,8 @@ Sema::InstantiatingTemplate::InstantiatingTemplate( } Invalid = SemaRef.pushCodeSynthesisContext(Inst); - if (!Invalid) { - AlreadyInstantiating = - !Inst.Entity - ? 
false - : !SemaRef.InstantiatingSpecializations - .insert({Inst.Entity->getCanonicalDecl(), Inst.Kind}) - .second; + if (!Invalid) atTemplateBegin(SemaRef.TemplateInstCallbacks, SemaRef, Inst); - } } Sema::InstantiatingTemplate::InstantiatingTemplate( @@ -901,13 +895,6 @@ void Sema::popCodeSynthesisContext() { void Sema::InstantiatingTemplate::Clear() { if (!Invalid) { - if (!AlreadyInstantiating) { - auto &Active = SemaRef.CodeSynthesisContexts.back(); - if (Active.Entity) - SemaRef.InstantiatingSpecializations.erase( - {Active.Entity->getCanonicalDecl(), Active.Kind}); - } - atTemplateEnd(SemaRef.TemplateInstCallbacks, SemaRef, SemaRef.CodeSynthesisContexts.back()); @@ -1681,7 +1668,7 @@ namespace { ICNT && SemaRef.CodeSynthesisContexts.back().Kind == Sema::CodeSynthesisContext::BuildingDeductionGuides) { Type = inherited::TransformType( - ICNT->getOriginalDecl()->getCanonicalTemplateSpecializationType( + ICNT->getDecl()->getCanonicalTemplateSpecializationType( SemaRef.Context)); TLB.pushTrivial(SemaRef.Context, Type, TL.getNameLoc()); } @@ -2105,7 +2092,7 @@ TemplateInstantiator::TransformFirstQualifierInScope(NamedDecl *D, return cast_or_null(TransformDecl(Loc, D)); if (const TagType *Tag = T->getAs()) - return Tag->getOriginalDecl(); + return Tag->getDecl(); // The resulting type is not a tag; complain. getSema().Diag(Loc, diag::err_nested_name_spec_non_tag) << T; @@ -2863,9 +2850,9 @@ TemplateInstantiator::TransformNestedRequirement( TemplateArgs, Constraint->getSourceRange(), Satisfaction, /*TopLevelConceptId=*/nullptr, &NewConstraint); - assert(!Success || !Trap.hasErrorOccurred() && - "Substitution failures must be handled " - "by CheckConstraintSatisfaction."); + assert((!Success || !Trap.hasErrorOccurred()) && + "Substitution failures must be handled " + "by CheckConstraintSatisfaction."); } if (!Success || Satisfaction.HasSubstitutionFailure()) @@ -3311,17 +3298,20 @@ bool Sema::SubstDefaultArgument( FunctionDecl *FD = cast(Param->getDeclContext()); Expr *PatternExpr = Param->getUninstantiatedDefaultArg(); + RecursiveInstGuard AlreadyInstantiating( + *this, Param, RecursiveInstGuard::Kind::DefaultArgument); + if (AlreadyInstantiating) { + Param->setInvalidDecl(); + return Diag(Param->getBeginLoc(), diag::err_recursive_default_argument) + << FD << PatternExpr->getSourceRange(); + } + EnterExpressionEvaluationContext EvalContext( *this, ExpressionEvaluationContext::PotentiallyEvaluated, Param); InstantiatingTemplate Inst(*this, Loc, Param, TemplateArgs.getInnermost()); if (Inst.isInvalid()) return true; - if (Inst.isAlreadyInstantiating()) { - Diag(Param->getBeginLoc(), diag::err_recursive_default_argument) << FD; - Param->setInvalidDecl(); - return true; - } ExprResult Result; // C++ [dcl.fct.default]p5: @@ -3553,12 +3543,26 @@ namespace clang { } } -bool -Sema::InstantiateClass(SourceLocation PointOfInstantiation, - CXXRecordDecl *Instantiation, CXXRecordDecl *Pattern, - const MultiLevelTemplateArgumentList &TemplateArgs, - TemplateSpecializationKind TSK, - bool Complain) { +bool Sema::InstantiateClass(SourceLocation PointOfInstantiation, + CXXRecordDecl *Instantiation, + CXXRecordDecl *Pattern, + const MultiLevelTemplateArgumentList &TemplateArgs, + TemplateSpecializationKind TSK, bool Complain) { +#ifndef NDEBUG + RecursiveInstGuard AlreadyInstantiating(*this, Instantiation, + RecursiveInstGuard::Kind::Template); + assert(!AlreadyInstantiating && "should have been caught by caller"); +#endif + + return InstantiateClassImpl(PointOfInstantiation, Instantiation, Pattern, + 
TemplateArgs, TSK, Complain); +} + +bool Sema::InstantiateClassImpl( + SourceLocation PointOfInstantiation, CXXRecordDecl *Instantiation, + CXXRecordDecl *Pattern, const MultiLevelTemplateArgumentList &TemplateArgs, + TemplateSpecializationKind TSK, bool Complain) { + CXXRecordDecl *PatternDef = cast_or_null(Pattern->getDefinition()); if (DiagnoseUninstantiableTemplate(PointOfInstantiation, Instantiation, @@ -3595,7 +3599,6 @@ Sema::InstantiateClass(SourceLocation PointOfInstantiation, InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation); if (Inst.isInvalid()) return true; - assert(!Inst.isAlreadyInstantiating() && "should have been caught by caller"); PrettyDeclStackTraceEntry CrashInfo(Context, Instantiation, SourceLocation(), "instantiating class definition"); @@ -3807,6 +3810,12 @@ bool Sema::InstantiateEnum(SourceLocation PointOfInstantiation, EnumDecl *Instantiation, EnumDecl *Pattern, const MultiLevelTemplateArgumentList &TemplateArgs, TemplateSpecializationKind TSK) { +#ifndef NDEBUG + RecursiveInstGuard AlreadyInstantiating(*this, Instantiation, + RecursiveInstGuard::Kind::Template); + assert(!AlreadyInstantiating && "should have been caught by caller"); +#endif + EnumDecl *PatternDef = Pattern->getDefinition(); if (DiagnoseUninstantiableTemplate(PointOfInstantiation, Instantiation, Instantiation->getInstantiatedFromMemberEnum(), @@ -3824,8 +3833,6 @@ bool Sema::InstantiateEnum(SourceLocation PointOfInstantiation, InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation); if (Inst.isInvalid()) return true; - if (Inst.isAlreadyInstantiating()) - return false; PrettyDeclStackTraceEntry CrashInfo(Context, Instantiation, SourceLocation(), "instantiating enum definition"); @@ -3864,6 +3871,14 @@ bool Sema::InstantiateInClassInitializer( Pattern->getInClassInitStyle() && "pattern and instantiation disagree about init style"); + RecursiveInstGuard AlreadyInstantiating(*this, Instantiation, + RecursiveInstGuard::Kind::Template); + if (AlreadyInstantiating) + // Error out if we hit an instantiation cycle for this initializer. + return Diag(PointOfInstantiation, + diag::err_default_member_initializer_cycle) + << Instantiation; + // Error out if we haven't parsed the initializer of the pattern yet because // we are waiting for the closing brace of the outer class. Expr *OldInit = Pattern->getInClassInitializer(); @@ -3882,12 +3897,6 @@ bool Sema::InstantiateInClassInitializer( InstantiatingTemplate Inst(*this, PointOfInstantiation, Instantiation); if (Inst.isInvalid()) return true; - if (Inst.isAlreadyInstantiating()) { - // Error out if we hit an instantiation cycle for this initializer. 
- Diag(PointOfInstantiation, diag::err_default_member_initializer_cycle) - << Instantiation; - return true; - } PrettyDeclStackTraceEntry CrashInfo(Context, Instantiation, SourceLocation(), "instantiating default member init"); @@ -3971,8 +3980,6 @@ static ActionResult getPatternForClassTemplateSpecialization( Sema::InstantiatingTemplate Inst(S, PointOfInstantiation, ClassTemplateSpec); if (Inst.isInvalid()) return {/*Invalid=*/true}; - if (Inst.isAlreadyInstantiating()) - return {/*Invalid=*/false}; llvm::PointerUnion @@ -4135,6 +4142,11 @@ bool Sema::InstantiateClassTemplateSpecialization( if (ClassTemplateSpec->isInvalidDecl()) return true; + Sema::RecursiveInstGuard AlreadyInstantiating( + *this, ClassTemplateSpec, Sema::RecursiveInstGuard::Kind::Template); + if (AlreadyInstantiating) + return false; + bool HadAvaibilityWarning = ShouldDiagnoseAvailabilityOfDecl(ClassTemplateSpec, nullptr, nullptr) .first != AR_Available; @@ -4147,7 +4159,7 @@ bool Sema::InstantiateClassTemplateSpecialization( if (!Pattern.isUsable()) return Pattern.isInvalid(); - bool Err = InstantiateClass( + bool Err = InstantiateClassImpl( PointOfInstantiation, ClassTemplateSpec, Pattern.get(), getTemplateInstantiationArgs(ClassTemplateSpec), TSK, Complain); @@ -4487,6 +4499,119 @@ ExprResult Sema::SubstConstraintExprWithoutSatisfaction( return Instantiator.TransformExpr(E); } +ExprResult Sema::SubstConceptTemplateArguments( + const ConceptSpecializationExpr *CSE, const Expr *ConstraintExpr, + const MultiLevelTemplateArgumentList &MLTAL) { + TemplateInstantiator Instantiator(*this, MLTAL, SourceLocation(), + DeclarationName()); + const ASTTemplateArgumentListInfo *ArgsAsWritten = + CSE->getTemplateArgsAsWritten(); + TemplateArgumentListInfo SubstArgs(ArgsAsWritten->getLAngleLoc(), + ArgsAsWritten->getRAngleLoc()); + + Sema::InstantiatingTemplate Inst( + *this, ArgsAsWritten->arguments().front().getSourceRange().getBegin(), + Sema::InstantiatingTemplate::ConstraintNormalization{}, + CSE->getNamedConcept(), + ArgsAsWritten->arguments().front().getSourceRange()); + + if (Inst.isInvalid()) + return ExprError(); + + if (Instantiator.TransformConceptTemplateArguments( + ArgsAsWritten->getTemplateArgs(), + ArgsAsWritten->getTemplateArgs() + + ArgsAsWritten->getNumTemplateArgs(), + SubstArgs)) + return true; + + llvm::SmallVector NewArgList = llvm::map_to_vector( + SubstArgs.arguments(), + [](const TemplateArgumentLoc &Loc) { return Loc.getArgument(); }); + + MultiLevelTemplateArgumentList MLTALForConstraint = + getTemplateInstantiationArgs( + CSE->getNamedConcept(), + CSE->getNamedConcept()->getLexicalDeclContext(), + /*Final=*/false, + /*Innermost=*/NewArgList, + /*RelativeToPrimary=*/true, + /*Pattern=*/nullptr, + /*ForConstraintInstantiation=*/true); + + // Rebuild a constraint, only substituting non-dependent concept names + // and nothing else. + // Given C. + // only SomeConceptName is substituted, in the constraint expression of C. 
+  struct ConstraintExprTransformer : TreeTransform<ConstraintExprTransformer> {
+    using Base = TreeTransform<ConstraintExprTransformer>;
+    MultiLevelTemplateArgumentList &MLTAL;
+
+    ConstraintExprTransformer(Sema &SemaRef,
+                              MultiLevelTemplateArgumentList &MLTAL)
+        : TreeTransform(SemaRef), MLTAL(MLTAL) {}
+
+    ExprResult TransformExpr(Expr *E) {
+      if (!E)
+        return E;
+      switch (E->getStmtClass()) {
+      case Stmt::BinaryOperatorClass:
+      case Stmt::ConceptSpecializationExprClass:
+      case Stmt::ParenExprClass:
+      case Stmt::UnresolvedLookupExprClass:
+        return Base::TransformExpr(E);
+      default:
+        break;
+      }
+      return E;
+    }
+
+    // Rebuild both branches of a conjunction / disjunction
+    // even if there is a substitution failure in one of
+    // the branches.
+    ExprResult TransformBinaryOperator(BinaryOperator *E) {
+      if (!(E->getOpcode() == BinaryOperatorKind::BO_LAnd ||
+            E->getOpcode() == BinaryOperatorKind::BO_LOr))
+        return E;
+
+      ExprResult LHS = TransformExpr(E->getLHS());
+      ExprResult RHS = TransformExpr(E->getRHS());
+
+      if (LHS.get() == E->getLHS() && RHS.get() == E->getRHS())
+        return E;
+
+      return BinaryOperator::Create(SemaRef.Context, LHS.get(), RHS.get(),
+                                    E->getOpcode(), SemaRef.Context.BoolTy,
+                                    VK_PRValue, OK_Ordinary,
+                                    E->getOperatorLoc(), FPOptionsOverride{});
+    }
+
+    bool TransformTemplateArgument(const TemplateArgumentLoc &Input,
+                                   TemplateArgumentLoc &Output,
+                                   bool Uneval = false) {
+      if (Input.getArgument().isConceptOrConceptTemplateParameter())
+        return Base::TransformTemplateArgument(Input, Output, Uneval);
+
+      Output = Input;
+      return false;
+    }
+
+    ExprResult TransformUnresolvedLookupExpr(UnresolvedLookupExpr *E,
+                                             bool IsAddressOfOperand = false) {
+      if (E->isConceptReference()) {
+        ExprResult Res = SemaRef.SubstExpr(E, MLTAL);
+        return Res;
+      }
+      return E;
+    }
+  };
+
+  ConstraintExprTransformer Transformer(*this, MLTALForConstraint);
+  ExprResult Res =
+      Transformer.TransformExpr(const_cast<Expr *>(ConstraintExpr));
+  return Res;
+}
+
 ExprResult
 Sema::SubstInitializer(Expr *Init,
                        const MultiLevelTemplateArgumentList &TemplateArgs,
                        bool CXXDirectInit) {
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 85e3d207b2cf2..28925cca8f956 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -707,6 +707,23 @@ static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
   S.AMDGPU().addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
 }

+static void instantiateDependentCUDAClusterDimsAttr(
+    Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+    const CUDAClusterDimsAttr &Attr, Decl *New) {
+  EnterExpressionEvaluationContext Unevaluated(
+      S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+  auto SubstElt = [&S, &TemplateArgs](Expr *E) {
+    return E ? S.SubstExpr(E, TemplateArgs).get() : nullptr;
+  };
+
+  Expr *XExpr = SubstElt(Attr.getX());
+  Expr *YExpr = SubstElt(Attr.getY());
+  Expr *ZExpr = SubstElt(Attr.getZ());
+
+  S.addClusterDimsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
 // This doesn't take any template parameters, but we have a custom action that
 // needs to happen when the kernel itself is instantiated. We need to run the
 // ItaniumMangler to mark the names required to name this kernel.
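The instantiation hook above pairs with the handleClusterDimsAttr logic added in SemaDeclAttr.cpp: dependent dimension expressions are accepted as-is and only validated once substituted. A sketch in user code, assuming a cluster_dims spelling for the attribute (the Attr.td change defining the spelling is not shown in this part of the patch):

    template <int X>
    __attribute__((cluster_dims(X, 2)))     // dependent X survives until instantiation
    __global__ void Kernel() {}

    void Launch() { Kernel<4><<<1, 1>>>(); } // flat size 4 * 2 = 8, within the sm_90 cap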
@@ -765,10 +782,18 @@ static bool isRelevantAttr(Sema &S, const Decl *D, const Attr *A) {
 
 static void instantiateDependentHLSLParamModifierAttr(
     Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
-    const HLSLParamModifierAttr *Attr, Decl *New) {
-  ParmVarDecl *P = cast<ParmVarDecl>(New);
-  P->addAttr(Attr->clone(S.getASTContext()));
-  P->setType(S.HLSL().getInoutParameterType(P->getType()));
+    const HLSLParamModifierAttr *Attr, const Decl *Old, Decl *New) {
+  ParmVarDecl *NewParm = cast<ParmVarDecl>(New);
+  NewParm->addAttr(Attr->clone(S.getASTContext()));
+
+  const Type *OldParmTy = cast<ParmVarDecl>(Old)->getType().getTypePtr();
+  if (OldParmTy->isDependentType() && Attr->isAnyOut())
+    NewParm->setType(S.HLSL().getInoutParameterType(NewParm->getType()));
+
+  assert(
+      (!Attr->isAnyOut() || (NewParm->getType().isRestrictQualified() &&
+                             NewParm->getType()->isReferenceType())) &&
+      "out or inout parameter type must be a reference and restrict qualified");
 }
 
 void Sema::InstantiateAttrsForDecl(
@@ -921,9 +946,14 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
           *this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
     }
 
+    if (const auto *CUDAClusterDims = dyn_cast<CUDAClusterDimsAttr>(TmplAttr)) {
+      instantiateDependentCUDAClusterDimsAttr(*this, TemplateArgs,
+                                              *CUDAClusterDims, New);
+    }
+
     if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
       instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
-                                                New);
+                                                Tmpl, New);
       continue;
     }
 
@@ -1522,9 +1552,9 @@ Decl *TemplateDeclInstantiator::InstantiateTypedefNameDecl(TypedefNameDecl *D,
   // If the old typedef was the name for linkage purposes of an anonymous
   // tag decl, re-establish that relationship for the new typedef.
   if (const TagType *oldTagType = D->getUnderlyingType()->getAs<TagType>()) {
-    TagDecl *oldTag = oldTagType->getOriginalDecl();
+    TagDecl *oldTag = oldTagType->getDecl();
     if (oldTag->getTypedefNameForAnonDecl() == D && !Invalid) {
-      TagDecl *newTag = DI->getType()->castAs<TagType>()->getOriginalDecl();
+      TagDecl *newTag = DI->getType()->castAs<TagType>()->getDecl();
       assert(!newTag->hasNameForLinkage());
       newTag->setTypedefNameForAnonDecl(Typedef);
     }
@@ -5282,6 +5312,16 @@ void Sema::InstantiateExceptionSpec(SourceLocation PointOfInstantiation,
   if (Proto->getExceptionSpecType() != EST_Uninstantiated)
     return;
 
+  RecursiveInstGuard AlreadyInstantiating(
+      *this, Decl, RecursiveInstGuard::Kind::ExceptionSpec);
+  if (AlreadyInstantiating) {
+    // This exception specification indirectly depends on itself. Reject.
+    // FIXME: Corresponding rule in the standard?
+    Diag(PointOfInstantiation, diag::err_exception_spec_cycle) << Decl;
+    UpdateExceptionSpec(Decl, EST_None);
+    return;
+  }
+
   InstantiatingTemplate Inst(*this, PointOfInstantiation, Decl,
                              InstantiatingTemplate::ExceptionSpecification());
   if (Inst.isInvalid()) {
@@ -5290,13 +5330,6 @@ void Sema::InstantiateExceptionSpec(SourceLocation PointOfInstantiation,
     UpdateExceptionSpec(Decl, EST_None);
     return;
   }
-  if (Inst.isAlreadyInstantiating()) {
-    // This exception specification indirectly depends on itself. Reject.
-    // FIXME: Corresponding rule in the standard?
-    Diag(PointOfInstantiation, diag::err_exception_spec_cycle) << Decl;
-    UpdateExceptionSpec(Decl, EST_None);
-    return;
-  }
 
   // Enter the scope of this instantiation. We don't use
   // PushDeclContext because we don't have a scope.
@@ -5356,8 +5389,6 @@ TemplateDeclInstantiator::InitFunctionInstantiation(FunctionDecl *New, if (ActiveInst.Kind == ActiveInstType::ExplicitTemplateArgumentSubstitution || ActiveInst.Kind == ActiveInstType::DeducedTemplateArgumentSubstitution) { if (isa(ActiveInst.Entity)) { - SemaRef.InstantiatingSpecializations.erase( - {ActiveInst.Entity->getCanonicalDecl(), ActiveInst.Kind}); atTemplateEnd(SemaRef.TemplateInstCallbacks, SemaRef, ActiveInst); ActiveInst.Kind = ActiveInstType::TemplateInstantiation; ActiveInst.Entity = New; @@ -5515,6 +5546,12 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation, Function = const_cast(ExistingDefn); } +#ifndef NDEBUG + RecursiveInstGuard AlreadyInstantiating(*this, Function, + RecursiveInstGuard::Kind::Template); + assert(!AlreadyInstantiating && "should have been caught by caller"); +#endif + // Find the function body that we'll be substituting. const FunctionDecl *PatternDecl = Function->getTemplateInstantiationPattern(); assert(PatternDecl && "instantiating a non-template"); @@ -5654,7 +5691,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation, } InstantiatingTemplate Inst(*this, PointOfInstantiation, Function); - if (Inst.isInvalid() || Inst.isAlreadyInstantiating()) + if (Inst.isInvalid()) return; PrettyDeclStackTraceEntry CrashInfo(Context, Function, SourceLocation(), "instantiating function definition"); @@ -5727,7 +5764,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation, Function->setDeclarationNameLoc(NameLocPointsToPattern()); EnterExpressionEvaluationContextForFunction EvalContext( - *this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated); + *this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated, Function); Qualifiers ThisTypeQuals; CXXRecordDecl *ThisContext = nullptr; @@ -5791,7 +5828,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation, QualType TransformRecordType(TypeLocBuilder &TLB, RecordTypeLoc TL) { const RecordType *T = TL.getTypePtr(); RecordDecl *Record = cast_or_null( - getDerived().TransformDecl(TL.getNameLoc(), T->getOriginalDecl())); + getDerived().TransformDecl(TL.getNameLoc(), T->getDecl())); if (Record != OldDecl) return Base::TransformRecordType(TLB, TL); @@ -6223,6 +6260,11 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation, if (TSK == TSK_ExplicitSpecialization) return; + RecursiveInstGuard AlreadyInstantiating(*this, Var, + RecursiveInstGuard::Kind::Template); + if (AlreadyInstantiating) + return; + // Find the pattern and the arguments to substitute into it. VarDecl *PatternDecl = Var->getTemplateInstantiationPattern(); assert(PatternDecl && "no pattern for templated variable"); @@ -6246,7 +6288,7 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation, // FIXME: Factor out the duplicated instantiation context setup/tear down // code here. 
InstantiatingTemplate Inst(*this, PointOfInstantiation, Var); - if (Inst.isInvalid() || Inst.isAlreadyInstantiating()) + if (Inst.isInvalid()) return; PrettyDeclStackTraceEntry CrashInfo(Context, Var, SourceLocation(), "instantiating variable initializer"); @@ -6350,7 +6392,7 @@ void Sema::InstantiateVariableDefinition(SourceLocation PointOfInstantiation, } InstantiatingTemplate Inst(*this, PointOfInstantiation, Var); - if (Inst.isInvalid() || Inst.isAlreadyInstantiating()) + if (Inst.isInvalid()) return; PrettyDeclStackTraceEntry CrashInfo(Context, Var, SourceLocation(), "instantiating variable definition"); diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index a9e7c34de94f4..7c1fb12a549e6 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -1238,8 +1238,8 @@ static QualType ConvertDeclSpecToType(TypeProcessingState &state) { Result = S.GetTypeFromParser(DS.getRepAsType()); assert(!Result.isNull() && "Didn't get a type for typeof?"); if (!Result->isDependentType()) - if (const TagType *TT = Result->getAs()) - S.DiagnoseUseOfDecl(TT->getOriginalDecl(), DS.getTypeSpecTypeLoc()); + if (const auto *TT = Result->getAs()) + S.DiagnoseUseOfDecl(TT->getDecl(), DS.getTypeSpecTypeLoc()); // TypeQuals handled by caller. Result = Context.getTypeOfType( Result, DS.getTypeSpecType() == DeclSpec::TST_typeof_unqualType @@ -2517,12 +2517,18 @@ QualType Sema::BuildMatrixType(QualType ElementTy, Expr *NumRows, Expr *NumCols, Diag(AttrLoc, diag::err_attribute_zero_size) << "matrix" << ColRange; return QualType(); } - if (!ConstantMatrixType::isDimensionValid(MatrixRows)) { + if (MatrixRows > Context.getLangOpts().MaxMatrixDimension && + MatrixColumns > Context.getLangOpts().MaxMatrixDimension) { + Diag(AttrLoc, diag::err_attribute_size_too_large) + << RowRange << ColRange << "matrix row and column"; + return QualType(); + } + if (MatrixRows > Context.getLangOpts().MaxMatrixDimension) { Diag(AttrLoc, diag::err_attribute_size_too_large) << RowRange << "matrix row"; return QualType(); } - if (!ConstantMatrixType::isDimensionValid(MatrixColumns)) { + if (MatrixColumns > Context.getLangOpts().MaxMatrixDimension) { Diag(AttrLoc, diag::err_attribute_size_too_large) << ColRange << "matrix column"; return QualType(); @@ -9699,7 +9705,7 @@ QualType Sema::BuildTypeofExprType(Expr *E, TypeOfKind Kind) { if (!E->isTypeDependent()) { QualType T = E->getType(); if (const TagType *TT = T->getAs()) - DiagnoseUseOfDecl(TT->getOriginalDecl(), E->getExprLoc()); + DiagnoseUseOfDecl(TT->getDecl(), E->getExprLoc()); } return Context.getTypeOfExprType(E, Kind); } @@ -9865,7 +9871,7 @@ QualType Sema::BuildPackIndexingType(QualType Pattern, Expr *IndexExpr, static QualType GetEnumUnderlyingType(Sema &S, QualType BaseType, SourceLocation Loc) { assert(BaseType->isEnumeralType()); - EnumDecl *ED = BaseType->castAs()->getOriginalDecl(); + EnumDecl *ED = BaseType->castAs()->getDecl(); S.DiagnoseUseOfDecl(ED, Loc); diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp index 3e34675cbf064..38877967af05e 100644 --- a/clang/lib/Sema/SemaTypeTraits.cpp +++ b/clang/lib/Sema/SemaTypeTraits.cpp @@ -1076,8 +1076,7 @@ static bool EvaluateUnaryTypeTrait(Sema &Self, TypeTrait UTT, if (T.isPODType(C) || T->isObjCLifetimeType()) return true; if (CXXRecordDecl *RD = C.getBaseElementType(T)->getAsCXXRecordDecl()) { - if (RD->hasTrivialDefaultConstructor() && - !RD->hasNonTrivialDefaultConstructor()) + if (RD->hasTrivialDefaultConstructor()) return true; bool FoundConstructor = 
false;
@@ -1165,14 +1164,26 @@ static bool EvaluateUnaryTypeTrait(Sema &Self, TypeTrait UTT,
       const CXXDestructorDecl *Dtor = RD->getDestructor();
       if (UnqualT->isAggregateType() && (!Dtor || !Dtor->isUserProvided()))
         return true;
-      if (RD->hasTrivialDestructor() && (!Dtor || !Dtor->isDeleted())) {
-        for (CXXConstructorDecl *Ctr : RD->ctors()) {
-          if (Ctr->isIneligibleOrNotSelected() || Ctr->isDeleted())
-            continue;
-          if (Ctr->isTrivial())
-            return true;
-        }
+      bool HasTrivialNonDeletedDtr =
+          RD->hasTrivialDestructor() && (!Dtor || !Dtor->isDeleted());
+      if (!HasTrivialNonDeletedDtr)
+        return false;
+      for (CXXConstructorDecl *Ctr : RD->ctors()) {
+        if (Ctr->isIneligibleOrNotSelected() || Ctr->isDeleted())
+          continue;
+        if (Ctr->isTrivial())
+          return true;
       }
+      if (RD->needsImplicitDefaultConstructor() &&
+          RD->hasTrivialDefaultConstructor() &&
+          !RD->hasNonTrivialDefaultConstructor())
+        return true;
+      if (RD->needsImplicitCopyConstructor() && RD->hasTrivialCopyConstructor() &&
+          !RD->defaultedCopyConstructorIsDeleted())
+        return true;
+      if (RD->needsImplicitMoveConstructor() && RD->hasTrivialMoveConstructor() &&
+          !RD->defaultedMoveConstructorIsDeleted())
+        return true;
       return false;
     }
   case UTT_IsIntangibleType:
@@ -1613,9 +1624,9 @@ bool Sema::BuiltinIsBaseOf(SourceLocation RhsTLoc, QualType LhsT,
 
   // Unions are never base classes, and never have base classes.
   // It doesn't matter if they are complete or not. See PR#41843
-  if (lhsRecord && lhsRecord->getOriginalDecl()->isUnion())
+  if (lhsRecord && lhsRecord->getDecl()->isUnion())
     return false;
-  if (rhsRecord && rhsRecord->getOriginalDecl()->isUnion())
+  if (rhsRecord && rhsRecord->getDecl()->isUnion())
     return false;
 
   if (lhsRecord == rhsRecord)
@@ -1629,8 +1640,8 @@ bool Sema::BuiltinIsBaseOf(SourceLocation RhsTLoc, QualType LhsT,
                           diag::err_incomplete_type_used_in_type_trait_expr))
     return false;
 
-  return cast<CXXRecordDecl>(rhsRecord->getOriginalDecl())
-      ->isDerivedFrom(cast<CXXRecordDecl>(lhsRecord->getOriginalDecl()));
+  return cast<CXXRecordDecl>(rhsRecord->getDecl())
+      ->isDerivedFrom(cast<CXXRecordDecl>(lhsRecord->getDecl()));
 }
 
 static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT,
@@ -1670,9 +1681,8 @@ static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT,
                                   diag::err_incomplete_type))
       return false;
 
-    return cast<CXXRecordDecl>(DerivedRecord->getOriginalDecl())
-        ->isVirtuallyDerivedFrom(
-            cast<CXXRecordDecl>(BaseRecord->getOriginalDecl()));
+    return cast<CXXRecordDecl>(DerivedRecord->getDecl())
+        ->isVirtuallyDerivedFrom(cast<CXXRecordDecl>(BaseRecord->getDecl()));
   }
   case BTT_IsSame:
     return Self.Context.hasSameType(LhsT, RhsT);
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 04a5e4b4ef90d..0c8c1d18d317e 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -694,6 +694,12 @@ class TreeTransform {
                                   TemplateArgumentListInfo &Outputs,
                                   bool Uneval = false);
 
+  template <typename InputIterator>
+  bool TransformConceptTemplateArguments(InputIterator First,
+                                         InputIterator Last,
+                                         TemplateArgumentListInfo &Outputs,
+                                         bool Uneval = false);
+
   /// Checks if the argument pack from \p In will need to be expanded and does
   /// the necessary prework.
   /// Whether the expansion is needed is captured in Info.Expand.
@@ -5192,6 +5198,49 @@ bool TreeTransform<Derived>::TransformTemplateArguments(
   return false;
 }
 
+template <typename Derived>
+template <typename InputIterator>
+bool TreeTransform<Derived>::TransformConceptTemplateArguments(
+    InputIterator First, InputIterator Last, TemplateArgumentListInfo &Outputs,
+    bool Uneval) {
+
+  // [C++26][temp.constr.normal]
+  // any non-dependent concept template argument
+  // is substituted into the constraint-expression of C.
+  auto isNonDependentConceptArgument = [](const TemplateArgument &Arg) {
+    return !Arg.isDependent() && Arg.isConceptOrConceptTemplateParameter();
+  };
+
+  for (; First != Last; ++First) {
+    TemplateArgumentLoc Out;
+    TemplateArgumentLoc In = *First;
+
+    if (In.getArgument().getKind() == TemplateArgument::Pack) {
+      typedef TemplateArgumentLocInventIterator<Derived,
+                                                TemplateArgument::pack_iterator>
+          PackLocIterator;
+      if (TransformConceptTemplateArguments(
+              PackLocIterator(*this, In.getArgument().pack_begin()),
+              PackLocIterator(*this, In.getArgument().pack_end()), Outputs,
+              Uneval))
+        return true;
+      continue;
+    }
+
+    if (!isNonDependentConceptArgument(In.getArgument())) {
+      Outputs.addArgument(In);
+      continue;
+    }
+
+    if (getDerived().TransformTemplateArgument(In, Out, Uneval))
+      return true;
+
+    Outputs.addArgument(Out);
+  }
+
+  return false;
+}
+
 // FIXME: Find ways to reduce code duplication for pack expansions.
 template <typename Derived>
 bool TreeTransform<Derived>::PreparePackForExpansion(TemplateArgumentLoc In,
@@ -7160,13 +7209,13 @@ QualType TreeTransform<Derived>::TransformTagType(TypeLocBuilder &TLB,
   }
 
   auto *TD = cast_or_null<TagDecl>(
-      getDerived().TransformDecl(TL.getNameLoc(), T->getOriginalDecl()));
+      getDerived().TransformDecl(TL.getNameLoc(), T->getDecl()));
   if (!TD)
     return QualType();
 
   QualType Result = TL.getType();
   if (getDerived().AlwaysRebuild() || QualifierLoc != TL.getQualifierLoc() ||
-      TD != T->getOriginalDecl()) {
+      TD != T->getDecl()) {
     if (T->isCanonicalUnqualified())
       Result = getDerived().RebuildCanonicalTagType(TD);
     else
@@ -16381,12 +16430,16 @@ ExprResult TreeTransform<Derived>::TransformSubstNonTypeTemplateParmExpr(
       AssociatedDecl == E->getAssociatedDecl())
     return E;
 
-  auto getParamAndType = [Index = E->getIndex()](Decl *AssociatedDecl)
+  auto getParamAndType = [E](Decl *AssociatedDecl)
       -> std::tuple<NonTypeTemplateParmDecl *, QualType> {
-    auto [PDecl, Arg] = getReplacedTemplateParameter(AssociatedDecl, Index);
+    auto [PDecl, Arg] =
+        getReplacedTemplateParameter(AssociatedDecl, E->getIndex());
     auto *Param = cast<NonTypeTemplateParmDecl>(PDecl);
-    return {Param, Arg.isNull() ?
Param->getType() - : Arg.getNonTypeTemplateArgumentType()}; + if (Arg.isNull()) + return {Param, Param->getType()}; + if (UnsignedOrNone PackIndex = E->getPackIndex()) + Arg = Arg.getPackAsArray()[*PackIndex]; + return {Param, Arg.getNonTypeTemplateArgumentType()}; }; // If the replacement expression did not change, and the parameter type diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 32f7a0ef50bc2..8b3fd41adb465 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -5504,7 +5504,7 @@ void ASTReader::InitializeContext() { Error("Invalid FILE type in AST file"); return; } - Context.setFILEDecl(Tag->getOriginalDecl()); + Context.setFILEDecl(Tag->getDecl()); } } } @@ -5525,7 +5525,7 @@ void ASTReader::InitializeContext() { Error("Invalid jmp_buf type in AST file"); return; } - Context.setjmp_bufDecl(Tag->getOriginalDecl()); + Context.setjmp_bufDecl(Tag->getDecl()); } } } @@ -5543,7 +5543,7 @@ void ASTReader::InitializeContext() { else { const TagType *Tag = Sigjmp_bufType->getAs(); assert(Tag && "Invalid sigjmp_buf type in AST file"); - Context.setsigjmp_bufDecl(Tag->getOriginalDecl()); + Context.setsigjmp_bufDecl(Tag->getDecl()); } } } @@ -5578,7 +5578,7 @@ void ASTReader::InitializeContext() { else { const TagType *Tag = Ucontext_tType->getAs(); assert(Tag && "Invalid ucontext_t type in AST file"); - Context.setucontext_tDecl(Tag->getOriginalDecl()); + Context.setucontext_tDecl(Tag->getDecl()); } } } diff --git a/clang/lib/Serialization/TemplateArgumentHasher.cpp b/clang/lib/Serialization/TemplateArgumentHasher.cpp index 3e8ffea78c2f1..353e8a2daa925 100644 --- a/clang/lib/Serialization/TemplateArgumentHasher.cpp +++ b/clang/lib/Serialization/TemplateArgumentHasher.cpp @@ -358,7 +358,7 @@ class TypeVisitorHelper : public TypeVisitor { AddQualType(T->getReplacementType()); } - void VisitTagType(const TagType *T) { AddDecl(T->getOriginalDecl()); } + void VisitTagType(const TagType *T) { AddDecl(T->getDecl()); } void VisitRecordType(const RecordType *T) { VisitTagType(T); } void VisitEnumType(const EnumType *T) { VisitTagType(T); } diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp index bf35bee70870b..3ddd6590fcbb0 100644 --- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp @@ -104,7 +104,7 @@ class RAIIMutexDescriptor { // this function is called instead of early returning it. To avoid this, a // bool variable (IdentifierInfoInitialized) is used and the function will // be run only once. 
-      const auto &ASTCtx = Call.getState()->getStateManager().getContext();
+      const auto &ASTCtx = Call.getASTContext();
       Guard = &ASTCtx.Idents.get(GuardName);
     }
   }
diff --git a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
index 0ae784c000f60..144411495f5a1 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
@@ -251,6 +251,8 @@ class CStringChecker
                                       const Expr *Ex,
                                       const MemRegion *MR,
                                       bool hypothetical);
+  static const StringLiteral *getStringLiteralFromRegion(const MemRegion *MR);
+
   SVal getCStringLength(CheckerContext &C,
                         ProgramStateRef &state,
                         const Expr *Ex,
@@ -983,6 +985,21 @@ SVal CStringChecker::getCStringLengthForRegion(CheckerContext &C,
   return strLength;
 }
 
+const StringLiteral *
+CStringChecker::getStringLiteralFromRegion(const MemRegion *MR) {
+  switch (MR->getKind()) {
+  case MemRegion::StringRegionKind:
+    return cast<StringRegion>(MR)->getStringLiteral();
+  case MemRegion::NonParamVarRegionKind:
+    if (const VarDecl *Decl = cast<NonParamVarRegion>(MR)->getDecl();
+        Decl->getType().isConstQualified() && Decl->hasGlobalStorage())
+      return dyn_cast_or_null<StringLiteral>(Decl->getInit());
+    return nullptr;
+  default:
+    return nullptr;
+  }
+}
+
 SVal CStringChecker::getCStringLength(CheckerContext &C, ProgramStateRef &state,
                                       const Expr *Ex, SVal Buf,
                                       bool hypothetical) const {
@@ -1013,30 +1030,19 @@ SVal CStringChecker::getCStringLength(CheckerContext &C, ProgramStateRef &state,
   // its length. For anything we can't figure out, just return UnknownVal.
   MR = MR->StripCasts();
 
-  switch (MR->getKind()) {
-  case MemRegion::StringRegionKind: {
-    // Modifying the contents of string regions is undefined [C99 6.4.5p6],
-    // so we can assume that the byte length is the correct C string length.
-    SValBuilder &svalBuilder = C.getSValBuilder();
-    QualType sizeTy = svalBuilder.getContext().getSizeType();
-    const StringLiteral *strLit = cast<StringRegion>(MR)->getStringLiteral();
-    return svalBuilder.makeIntVal(strLit->getLength(), sizeTy);
-  }
-  case MemRegion::NonParamVarRegionKind: {
+  if (const StringLiteral *StrLit = getStringLiteralFromRegion(MR)) {
     // If we have a global constant with a string literal initializer,
     // compute the initializer's length.
-    const VarDecl *Decl = cast<NonParamVarRegion>(MR)->getDecl();
-    if (Decl->getType().isConstQualified() && Decl->hasGlobalStorage()) {
-      if (const Expr *Init = Decl->getInit()) {
-        if (auto *StrLit = dyn_cast<StringLiteral>(Init)) {
-          SValBuilder &SvalBuilder = C.getSValBuilder();
-          QualType SizeTy = SvalBuilder.getContext().getSizeType();
-          return SvalBuilder.makeIntVal(StrLit->getLength(), SizeTy);
-        }
-      }
-    }
-    [[fallthrough]];
+    // Modifying the contents of string regions is undefined [C99 6.4.5p6],
+    // so we can assume that the byte length is the correct C string length.
+    // FIXME: Embedded null characters are not handled.
+    SValBuilder &SVB = C.getSValBuilder();
+    return SVB.makeIntVal(StrLit->getLength(), SVB.getContext().getSizeType());
   }
+
+  switch (MR->getKind()) {
+  case MemRegion::StringRegionKind:
+  case MemRegion::NonParamVarRegionKind:
   case MemRegion::SymbolicRegionKind:
   case MemRegion::AllocaRegionKind:
   case MemRegion::ParamVarRegionKind:
@@ -1046,10 +1052,28 @@ SVal CStringChecker::getCStringLength(CheckerContext &C, ProgramStateRef &state,
   case MemRegion::CompoundLiteralRegionKind:
     // FIXME: Can we track this? Is it necessary?
    return UnknownVal();
-  case MemRegion::ElementRegionKind:
-    // FIXME: How can we handle this? It's not good enough to subtract the
-    // offset from the base string length; consider "123\x00567" and &a[5].
+  case MemRegion::ElementRegionKind: {
+    // If an offset into the string literal is used, use the original length
+    // minus the offset.
+    // FIXME: Embedded null characters are not handled.
+    const ElementRegion *ER = cast<ElementRegion>(MR);
+    const SubRegion *SuperReg =
+        cast<SubRegion>(ER->getSuperRegion()->StripCasts());
+    const StringLiteral *StrLit = getStringLiteralFromRegion(SuperReg);
+    if (!StrLit)
+      return UnknownVal();
+    SValBuilder &SVB = C.getSValBuilder();
+    NonLoc Idx = ER->getIndex();
+    QualType SizeTy = SVB.getContext().getSizeType();
+    NonLoc LengthVal =
+        SVB.makeIntVal(StrLit->getLength(), SizeTy).castAs<NonLoc>();
+    if (state->assume(SVB.evalBinOpNN(state, BO_LE, Idx, LengthVal,
+                                      SVB.getConditionType())
+                          .castAs<DefinedOrUnknownSVal>(),
+                      true))
+      return SVB.evalBinOp(state, BO_Sub, LengthVal, Idx, SizeTy);
     return UnknownVal();
+  }
   default:
     // Other regions (mostly non-data) can't have a reliable C string length.
     // In this case, an error is emitted and UndefinedVal is returned.
@@ -1074,6 +1098,7 @@ SVal CStringChecker::getCStringLength(CheckerContext &C, ProgramStateRef &state,
 
 const StringLiteral *CStringChecker::getCStringLiteral(CheckerContext &C,
                                                        ProgramStateRef &state,
                                                        const Expr *expr,
                                                        SVal val) const {
+  // FIXME: use getStringLiteralFromRegion (and remove unused parameters)?
   // Get the memory region pointed to by the val.
   const MemRegion *bufRegion = val.getAsRegion();
diff --git a/clang/lib/StaticAnalyzer/Checkers/CallAndMessageChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CallAndMessageChecker.cpp
index b304350e0a2ef..7cc146ed29d0d 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CallAndMessageChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CallAndMessageChecker.cpp
@@ -246,7 +246,7 @@ class FindUninitializedField {
   bool Find(const TypedValueRegion *R) {
     QualType T = R->getValueType();
     if (const RecordType *RT = T->getAsStructureType()) {
-      const RecordDecl *RD = RT->getOriginalDecl()->getDefinition();
+      const RecordDecl *RD = RT->getDecl()->getDefinition();
       assert(RD && "Referred record has no definition");
       for (const auto *I : RD->fields()) {
         if (I->isUnnamedBitField())
diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
index 9d3aeff465ca1..242084876a3c5 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CheckObjCDealloc.cpp
@@ -929,7 +929,7 @@ ObjCDeallocChecker::getValueReleasedByNillingOut(const ObjCMethodCall &M,
   SVal Arg = M.getArgSVal(0);
   ProgramStateRef notNilState, nilState;
   std::tie(notNilState, nilState) =
-      M.getState()->assume(Arg.castAs<DefinedOrUnknownSVal>());
+      C.getState()->assume(Arg.castAs<DefinedOrUnknownSVal>());
   if (!(nilState && !notNilState))
     return nullptr;
diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp
index 17af1aebd6d2a..5e75c1c4a3abd 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp
@@ -154,15 +154,15 @@ void WalkAST::VisitCallExpr(CallExpr *CE) {
           .Case("mkstemp", &WalkAST::checkCall_mkstemp)
           .Case("mkdtemp", &WalkAST::checkCall_mkstemp)
           .Case("mkstemps", &WalkAST::checkCall_mkstemp)
-          .Cases("strcpy", "__strcpy_chk", &WalkAST::checkCall_strcpy)
-          .Cases("strcat", "__strcat_chk", &WalkAST::checkCall_strcat)
-          .Cases("sprintf", "vsprintf", "scanf", "wscanf", "fscanf", "fwscanf",
-                 "vscanf",
"vwscanf", "vfscanf", "vfwscanf", + .Cases({"strcpy", "__strcpy_chk"}, &WalkAST::checkCall_strcpy) + .Cases({"strcat", "__strcat_chk"}, &WalkAST::checkCall_strcat) + .Cases({"sprintf", "vsprintf", "scanf", "wscanf", "fscanf", "fwscanf", + "vscanf", "vwscanf", "vfscanf", "vfwscanf"}, &WalkAST::checkDeprecatedOrUnsafeBufferHandling) - .Cases("sscanf", "swscanf", "vsscanf", "vswscanf", "swprintf", - "snprintf", "vswprintf", "vsnprintf", "memcpy", "memmove", + .Cases({"sscanf", "swscanf", "vsscanf", "vswscanf", "swprintf", + "snprintf", "vswprintf", "vsnprintf", "memcpy", "memmove"}, &WalkAST::checkDeprecatedOrUnsafeBufferHandling) - .Cases("strncpy", "strncat", "memset", "fprintf", + .Cases({"strncpy", "strncat", "memset", "fprintf"}, &WalkAST::checkDeprecatedOrUnsafeBufferHandling) .Case("drand48", &WalkAST::checkCall_rand) .Case("erand48", &WalkAST::checkCall_rand) @@ -766,12 +766,14 @@ void WalkAST::checkDeprecatedOrUnsafeBufferHandling(const CallExpr *CE, int ArgIndex = llvm::StringSwitch(Name) - .Cases("scanf", "wscanf", "vscanf", "vwscanf", 0) - .Cases("fscanf", "fwscanf", "vfscanf", "vfwscanf", "sscanf", - "swscanf", "vsscanf", "vswscanf", 1) - .Cases("sprintf", "vsprintf", "fprintf", 1) - .Cases("swprintf", "snprintf", "vswprintf", "vsnprintf", "memcpy", - "memmove", "memset", "strncpy", "strncat", DEPR_ONLY) + .Cases({"scanf", "wscanf", "vscanf", "vwscanf"}, 0) + .Cases({"fscanf", "fwscanf", "vfscanf", "vfwscanf", "sscanf", + "swscanf", "vsscanf", "vswscanf"}, + 1) + .Cases({"sprintf", "vsprintf", "fprintf"}, 1) + .Cases({"swprintf", "snprintf", "vswprintf", "vsnprintf", "memcpy", + "memmove", "memset", "strncpy", "strncat"}, + DEPR_ONLY) .Default(UNKNOWN_CALL); assert(ArgIndex != UNKNOWN_CALL && "Unsupported function"); diff --git a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp index 395d724cdfd11..37f5ec3557400 100644 --- a/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/DereferenceChecker.cpp @@ -19,6 +19,7 @@ #include "clang/StaticAnalyzer/Core/CheckerManager.h" #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerHelpers.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" using namespace clang; @@ -39,9 +40,10 @@ class DerefBugType : public BugType { class DereferenceChecker : public CheckerFamily, EventDispatcher> { - void reportBug(const DerefBugType &BT, ProgramStateRef State, const Stmt *S, - CheckerContext &C) const; + void reportDerefBug(const DerefBugType &BT, ProgramStateRef State, + const Stmt *S, CheckerContext &C) const; bool suppressReport(CheckerContext &C, const Expr *E) const; @@ -50,6 +52,7 @@ class DereferenceChecker CheckerContext &C) const; void checkBind(SVal L, SVal V, const Stmt *S, bool AtDeclInit, CheckerContext &C) const; + void checkPreStmt(const BinaryOperator *Op, CheckerContext &C) const; static void AddDerefSource(raw_ostream &os, SmallVectorImpl &Ranges, @@ -57,7 +60,7 @@ class DereferenceChecker const LocationContext *LCtx, bool loadedFrom = false); - CheckerFrontend NullDerefChecker, FixedDerefChecker; + CheckerFrontend NullDerefChecker, FixedDerefChecker, NullPointerArithmChecker; const DerefBugType NullBug{&NullDerefChecker, "Dereference of null pointer", "a null pointer dereference", "a dereference of a null pointer"}; @@ -72,9 +75,22 @@ class DereferenceChecker const DerefBugType FixedAddressBug{&FixedDerefChecker, 
"Dereference of a fixed address", "a dereference of a fixed address"}; + const BugType NullPointerArithmBug{ + &NullPointerArithmChecker, + "Possibly undefined arithmetic operation involving a null pointer"}; StringRef getDebugTag() const override { return "DereferenceChecker"; } }; + +struct ValueDescStr { + SmallVectorImpl &Ranges; + const Expr *Ex; + const ProgramState *State; + const LocationContext *LCtx; + bool IsPointer; + ConditionTruthVal IsNull; +}; + } // end anonymous namespace void @@ -173,9 +189,9 @@ static bool isDeclRefExprToReference(const Expr *E) { return false; } -void DereferenceChecker::reportBug(const DerefBugType &BT, - ProgramStateRef State, const Stmt *S, - CheckerContext &C) const { +void DereferenceChecker::reportDerefBug(const DerefBugType &BT, + ProgramStateRef State, const Stmt *S, + CheckerContext &C) const { if (&BT == &FixedAddressBug) { if (!FixedDerefChecker.isEnabled()) // Deliberately don't add a sink node if check is disabled. @@ -249,9 +265,8 @@ void DereferenceChecker::reportBug(const DerefBugType &BT, bugreporter::trackExpressionValue(N, bugreporter::getDerefExpr(S), *BR); - for (SmallVectorImpl::iterator - I = Ranges.begin(), E = Ranges.end(); I!=E; ++I) - BR->addRange(*I); + for (const auto &R : Ranges) + BR->addRange(R); C.emitReport(std::move(BR)); } @@ -262,7 +277,7 @@ void DereferenceChecker::checkLocation(SVal l, bool isLoad, const Stmt* S, if (l.isUndef()) { const Expr *DerefExpr = getDereferenceExpr(S); if (!suppressReport(C, DerefExpr)) - reportBug(UndefBug, C.getState(), DerefExpr, C); + reportDerefBug(UndefBug, C.getState(), DerefExpr, C); return; } @@ -283,7 +298,7 @@ void DereferenceChecker::checkLocation(SVal l, bool isLoad, const Stmt* S, // we call an "explicit" null dereference. const Expr *expr = getDereferenceExpr(S); if (!suppressReport(C, expr)) { - reportBug(NullBug, nullState, expr, C); + reportDerefBug(NullBug, nullState, expr, C); return; } } @@ -301,7 +316,7 @@ void DereferenceChecker::checkLocation(SVal l, bool isLoad, const Stmt* S, if (location.isConstant()) { const Expr *DerefExpr = getDereferenceExpr(S, isLoad); if (!suppressReport(C, DerefExpr)) - reportBug(FixedAddressBug, notNullState, DerefExpr, C); + reportDerefBug(FixedAddressBug, notNullState, DerefExpr, C); return; } @@ -317,7 +332,7 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S, // One should never write to label addresses. 
if (auto Label = L.getAs()) { - reportBug(LabelBug, C.getState(), S, C); + reportDerefBug(LabelBug, C.getState(), S, C); return; } @@ -338,7 +353,7 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S, if (!StNonNull) { const Expr *expr = getDereferenceExpr(S, /*IsBind=*/true); if (!suppressReport(C, expr)) { - reportBug(NullBug, StNull, expr, C); + reportDerefBug(NullBug, StNull, expr, C); return; } } @@ -356,7 +371,7 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S, if (V.isConstant()) { const Expr *DerefExpr = getDereferenceExpr(S, true); if (!suppressReport(C, DerefExpr)) - reportBug(FixedAddressBug, State, DerefExpr, C); + reportDerefBug(FixedAddressBug, State, DerefExpr, C); return; } @@ -379,6 +394,96 @@ void DereferenceChecker::checkBind(SVal L, SVal V, const Stmt *S, C.addTransition(State, this); } +namespace llvm { +template <> struct format_provider { + static void format(const ValueDescStr &V, raw_ostream &Stream, + StringRef Style) { + static const char *ValueStr[2][3] = { + {"zero", "nonzero integer value", "probably nonzero integer value"}, + {"null pointer", "non-null pointer", "probably non-null pointer"}, + }; + Stream + << ValueStr[V.IsPointer][V.IsNull.isConstrainedTrue() + ? 0 + : (V.IsNull.isConstrainedFalse() ? 1 : 2)]; + DereferenceChecker::AddDerefSource(Stream, V.Ranges, V.Ex, V.State, V.LCtx, + false); + } +}; +} // namespace llvm + +void DereferenceChecker::checkPreStmt(const BinaryOperator *Op, + CheckerContext &C) const { + if (!Op->isAdditiveOp() || !NullPointerArithmChecker.isEnabled()) + return; + const Expr *E1 = Op->getLHS(); + const Expr *E2 = Op->getRHS(); + QualType T1 = E1->getType().getCanonicalType(); + QualType T2 = E2->getType().getCanonicalType(); + bool T1IsPointer = T1->isPointerType(); + bool T2IsPointer = T2->isPointerType(); + if (T1->isIntegerType() && T2->isIntegerType()) + return; + if (!T1IsPointer && !T1->isIntegerType() && !T2IsPointer && + !T2->isIntegerType()) + return; + + ProgramStateRef State = C.getState(); + ConditionTruthVal V1IsNull = State->isNull(C.getSVal(E1)); + ConditionTruthVal V2IsNull = State->isNull(C.getSVal(E2)); + bool IsConstrained = true; + + // Check cases 'NULL + x' and 'NULL - x' + if (T1IsPointer && !T2IsPointer) { + if (!V1IsNull.isConstrainedTrue() || V2IsNull.isConstrainedTrue()) + return; + IsConstrained = V2IsNull.isConstrainedFalse(); + } + + // Check case 'x + NULL' + if (!T1IsPointer && T2IsPointer) { + if (V1IsNull.isConstrainedTrue() || !V2IsNull.isConstrainedTrue()) + return; + IsConstrained = V1IsNull.isConstrainedFalse(); + } + + // Check case 'NULL - p' or 'p - NULL' + if (T1IsPointer && T2IsPointer) { + if (!V1IsNull.isConstrainedTrue() && !V2IsNull.isConstrainedTrue()) + return; + if (V1IsNull.isConstrainedTrue() && V2IsNull.isConstrainedTrue()) + return; + IsConstrained = + V1IsNull.isConstrainedFalse() || V2IsNull.isConstrainedFalse(); + } + + SmallVector Ranges; + const char *OpcodeStr = + Op->getOpcode() == BO_Add ? "Addition" : "Subtraction"; + const char *ResultStr = IsConstrained ? 
"results" : "may result"; + ValueDescStr DerefArg1{ + Ranges, E1, State.get(), C.getLocationContext(), T1IsPointer, V1IsNull}; + ValueDescStr DerefArg2{ + Ranges, E2, State.get(), C.getLocationContext(), T2IsPointer, V2IsNull}; + std::string Msg = + llvm::formatv("{0} of a {1} and a {2} {3} in undefined behavior", + OpcodeStr, DerefArg1, DerefArg2, ResultStr); + + ExplodedNode *N = C.generateErrorNode(State); + if (!N) + return; + auto BR = + std::make_unique(NullPointerArithmBug, Msg, N); + if (V1IsNull.isConstrainedTrue()) + bugreporter::trackExpressionValue(N, E1, *BR); + if (V2IsNull.isConstrainedTrue()) + bugreporter::trackExpressionValue(N, E2, *BR); + for (const auto &R : Ranges) + BR->addRange(R); + + C.emitReport(std::move(BR)); +} + void ento::registerNullDereferenceChecker(CheckerManager &Mgr) { Mgr.getChecker()->NullDerefChecker.enable(Mgr); } @@ -395,3 +500,11 @@ bool ento::shouldRegisterFixedAddressDereferenceChecker( const CheckerManager &) { return true; } + +void ento::registerNullPointerArithmChecker(CheckerManager &Mgr) { + Mgr.getChecker()->NullPointerArithmChecker.enable(Mgr); +} + +bool ento::shouldRegisterNullPointerArithmChecker(const CheckerManager &) { + return true; +} diff --git a/clang/lib/StaticAnalyzer/Checkers/NonNullParamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NonNullParamChecker.cpp index b1a7cd7620424..bc673910c16c3 100644 --- a/clang/lib/StaticAnalyzer/Checkers/NonNullParamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/NonNullParamChecker.cpp @@ -148,9 +148,8 @@ void NonNullParamChecker::checkPreCall(const CallEvent &Call, QualType T = ArgE->getType(); const RecordType *UT = T->getAsUnionType(); - if (!UT || !UT->getOriginalDecl() - ->getMostRecentDecl() - ->hasAttr()) + if (!UT || + !UT->getDecl()->getMostRecentDecl()->hasAttr()) continue; auto CSV = DV->getAs(); diff --git a/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp index f984caf59afb8..227cbfac770d2 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ObjCSuperDeallocChecker.cpp @@ -34,7 +34,7 @@ class ObjCSuperDeallocChecker this, "[super dealloc] should not be called more than once", categories::CoreFoundationObjectiveC}; - void initIdentifierInfoAndSelectors(ASTContext &Ctx) const; + void initIdentifierInfoAndSelectors(const ASTContext &Ctx) const; bool isSuperDeallocMessage(const ObjCMethodCall &M) const; @@ -214,8 +214,8 @@ void ObjCSuperDeallocChecker::diagnoseCallArguments(const CallEvent &CE, } } -void -ObjCSuperDeallocChecker::initIdentifierInfoAndSelectors(ASTContext &Ctx) const { +void ObjCSuperDeallocChecker::initIdentifierInfoAndSelectors( + const ASTContext &Ctx) const { if (IIdealloc) return; @@ -230,7 +230,7 @@ ObjCSuperDeallocChecker::isSuperDeallocMessage(const ObjCMethodCall &M) const { if (M.getOriginExpr()->getReceiverKind() != ObjCMessageExpr::SuperInstance) return false; - ASTContext &Ctx = M.getState()->getStateManager().getContext(); + const ASTContext &Ctx = M.getASTContext(); initIdentifierInfoAndSelectors(Ctx); return M.getSelector() == SELdealloc; diff --git a/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp index 4fc1c576a9687..db8bbee8761d5 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdVariantChecker.cpp @@ -211,13 +211,13 @@ class StdVariantChecker : public Checker { if (!DefaultType) 
return; - ProgramStateRef State = ConstructorCall->getState(); + ProgramStateRef State = C.getState(); State = State->set(ThisMemRegion, *DefaultType); C.addTransition(State); } bool handleStdGetCall(const CallEvent &Call, CheckerContext &C) const { - ProgramStateRef State = Call.getState(); + ProgramStateRef State = C.getState(); const auto &ArgType = Call.getArgSVal(0) .getType(C.getASTContext()) diff --git a/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h b/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h index dec461296fed5..b8fb57213fd65 100644 --- a/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h +++ b/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h @@ -52,7 +52,7 @@ removeInformationStoredForDeadInstances(const CallEvent &Call, template void handleConstructorAndAssignment(const CallEvent &Call, CheckerContext &C, SVal ThisSVal) { - ProgramStateRef State = Call.getState(); + ProgramStateRef State = C.getState(); if (!State) return; diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp index 66cfccbecf31f..84adbf318e9f8 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp @@ -26,6 +26,7 @@ bool tryToFindPtrOrigin( const Expr *E, bool StopAtFirstRefCountedObj, std::function isSafePtr, std::function isSafePtrType, + std::function isSafeGlobalDecl, std::function callback) { while (E) { if (auto *DRE = dyn_cast(E)) { @@ -34,6 +35,8 @@ bool tryToFindPtrOrigin( auto IsImmortal = safeGetName(VD) == "NSApp"; if (VD->hasGlobalStorage() && (IsImmortal || QT.isConstQualified())) return callback(E, true); + if (VD->hasGlobalStorage() && isSafeGlobalDecl(VD)) + return callback(E, true); } } if (auto *tempExpr = dyn_cast(E)) { @@ -71,9 +74,11 @@ bool tryToFindPtrOrigin( } if (auto *Expr = dyn_cast(E)) { return tryToFindPtrOrigin(Expr->getTrueExpr(), StopAtFirstRefCountedObj, - isSafePtr, isSafePtrType, callback) && + isSafePtr, isSafePtrType, isSafeGlobalDecl, + callback) && tryToFindPtrOrigin(Expr->getFalseExpr(), StopAtFirstRefCountedObj, - isSafePtr, isSafePtrType, callback); + isSafePtr, isSafePtrType, isSafeGlobalDecl, + callback); } if (auto *cast = dyn_cast(E)) { if (StopAtFirstRefCountedObj) { @@ -93,7 +98,8 @@ bool tryToFindPtrOrigin( if (auto *call = dyn_cast(E)) { if (auto *Callee = call->getCalleeDecl()) { if (Callee->hasAttr() || - Callee->hasAttr()) { + Callee->hasAttr() || + Callee->hasAttr()) { return callback(E, true); } } @@ -158,13 +164,23 @@ bool tryToFindPtrOrigin( auto Name = safeGetName(callee); if (Name == "__builtin___CFStringMakeConstantString" || - Name == "NSClassFromString") + Name == "NSStringFromSelector" || Name == "NSSelectorFromString" || + Name == "NSStringFromClass" || Name == "NSClassFromString" || + Name == "NSStringFromProtocol" || Name == "NSProtocolFromString") return callback(E, true); } else if (auto *CalleeE = call->getCallee()) { if (auto *E = dyn_cast(CalleeE->IgnoreParenCasts())) { if (isSingleton(E->getFoundDecl())) return callback(E, true); } + + if (auto *MemberExpr = dyn_cast(CalleeE)) { + auto *Base = MemberExpr->getBase(); + auto MemberName = MemberExpr->getMember().getAsString(); + bool IsGetter = MemberName == "get" || MemberName == "ptr"; + if (Base && isSafePtrType(Base->getType()) && IsGetter) + return callback(E, true); + } } // Sometimes, canonical type erroneously turns Ref into T. 
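For context on the tryToFindPtrOrigin changes in this file: the new MemberExpr branch treats a raw pointer produced by a get()/ptr() member call on a value whose type already satisfies isSafePtrType as a safe origin. A minimal sketch of the pattern this admits (hypothetical WebKit-style types and functions, not taken from this patch; Ref is WebKit's owning smart pointer):

    Ref<Widget> widget = makeWidget();
    consumeRaw(widget.ptr());  // callee receives a raw Widget*, but the
                               // origin is a getter on Ref<Widget>, so the
                               // checker accepts the argument as safe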
@@ -176,7 +192,7 @@ bool tryToFindPtrOrigin( if (auto *Subst = dyn_cast(RetType)) { if (auto *SubstType = Subst->desugar().getTypePtr()) { if (auto *RD = dyn_cast(SubstType)) { - if (auto *CXX = dyn_cast(RD->getOriginalDecl())) + if (auto *CXX = dyn_cast(RD->getDecl())) if (isSafePtr(CXX)) return callback(E, true); } @@ -196,6 +212,8 @@ bool tryToFindPtrOrigin( !Selector.getNumArgs()) return callback(E, true); } + if (auto *ObjCProtocol = dyn_cast(E)) + return callback(ObjCProtocol, true); if (auto *ObjCDict = dyn_cast(E)) return callback(ObjCDict, true); if (auto *ObjCArray = dyn_cast(E)) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h index 3a009d65efea6..9fff456b7e8b8 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h @@ -56,6 +56,7 @@ bool tryToFindPtrOrigin( const clang::Expr *E, bool StopAtFirstRefCountedObj, std::function isSafePtr, std::function isSafePtrType, + std::function isSafeGlobalDecl, std::function callback); /// For \p E referring to a ref-countable/-counted pointer/reference we return diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ForwardDeclChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ForwardDeclChecker.cpp index d8539eaaac49f..1d4e6dd572749 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ForwardDeclChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ForwardDeclChecker.cpp @@ -263,18 +263,43 @@ class ForwardDeclChecker : public Checker> { void visitCallArg(const Expr *Arg, const ParmVarDecl *Param, const Decl *DeclWithIssue) const { auto *ArgExpr = Arg->IgnoreParenCasts(); - if (auto *InnerCE = dyn_cast(Arg)) { - auto *InnerCallee = InnerCE->getDirectCallee(); - if (InnerCallee && InnerCallee->isInStdNamespace() && - safeGetName(InnerCallee) == "move" && InnerCE->getNumArgs() == 1) { - ArgExpr = InnerCE->getArg(0); - if (ArgExpr) - ArgExpr = ArgExpr->IgnoreParenCasts(); + while (ArgExpr) { + ArgExpr = ArgExpr->IgnoreParenCasts(); + if (auto *InnerCE = dyn_cast(ArgExpr)) { + auto *InnerCallee = InnerCE->getDirectCallee(); + if (InnerCallee && InnerCallee->isInStdNamespace() && + safeGetName(InnerCallee) == "move" && InnerCE->getNumArgs() == 1) { + ArgExpr = InnerCE->getArg(0); + continue; + } + } + if (auto *UO = dyn_cast(ArgExpr)) { + auto OpCode = UO->getOpcode(); + if (OpCode == UO_Deref || OpCode == UO_AddrOf) { + ArgExpr = UO->getSubExpr(); + continue; + } } + break; } + + if (auto *MemberCallExpr = dyn_cast(ArgExpr)) { + if (isOwnerPtrType(MemberCallExpr->getObjectType())) + return; + } + + if (auto *OpCE = dyn_cast(ArgExpr)) { + auto *Method = dyn_cast_or_null(OpCE->getDirectCallee()); + if (Method && isOwnerPtr(safeGetName(Method->getParent()))) { + if (OpCE->getOperator() == OO_Star && OpCE->getNumArgs() == 1) + return; + } + } + if (isNullPtr(ArgExpr) || isa(ArgExpr) || isa(ArgExpr)) return; + if (auto *DRE = dyn_cast(ArgExpr)) { if (auto *ValDecl = DRE->getDecl()) { if (isa(ValDecl)) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index e5c74bbaf3d6b..d3d1f13ab1c78 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -138,6 +138,11 @@ bool isCheckedPtr(const std::string &Name) { return Name == "CheckedPtr" || Name == "CheckedRef"; } +bool isOwnerPtr(const std::string &Name) { + return isRefType(Name) || 
isCheckedPtr(Name) || Name == "unique_ptr" ||
+         Name == "UniqueRef" || Name == "LazyUniqueRef";
+}
+
 bool isSmartPtrClass(const std::string &Name) {
   return isRefType(Name) || isCheckedPtr(Name) || isRetainPtrOrOSPtr(Name) ||
          Name == "WeakPtr" || Name == "WeakPtrFactory" ||
@@ -206,10 +211,7 @@ bool isRetainPtrOrOSPtrType(const clang::QualType T) {
 }
 
 bool isOwnerPtrType(const clang::QualType T) {
-  return isPtrOfType(T, [](auto Name) {
-    return isRefType(Name) || isCheckedPtr(Name) || Name == "unique_ptr" ||
-           Name == "UniqueRef" || Name == "LazyUniqueRef";
-  });
+  return isPtrOfType(T, [](auto Name) { return isOwnerPtr(Name); });
 }
 
 std::optional<bool> isUncounted(const QualType T) {
@@ -255,7 +257,7 @@ void RetainTypeChecker::visitTypedef(const TypedefDecl *TD) {
     return;
   }
 
-  for (auto *Redecl : RT->getOriginalDecl()->getMostRecentDecl()->redecls()) {
+  for (auto *Redecl : RT->getDecl()->getMostRecentDecl()->redecls()) {
     if (Redecl->getAttr<ObjCBridgeAttr>() ||
         Redecl->getAttr<ObjCBridgeMutableAttr>()) {
       CFPointees.insert(RT);
@@ -296,7 +298,7 @@ std::optional<bool> isUnretained(const QualType T, bool IsARCEnabled) {
   auto *Record = PointeeType->getAsStructureType();
   if (!Record)
     return false;
-  auto *Decl = Record->getOriginalDecl();
+  auto *Decl = Record->getDecl();
   if (!Decl)
     return false;
   auto TypeName = Decl->getName();
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
index 8300a6c051f3e..12e2e2d75b75d 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
@@ -143,6 +143,10 @@ bool isCheckedPtr(const std::string &Name);
 /// \returns true if \p Name is RetainPtr or its variant, false if not.
 bool isRetainPtrOrOSPtr(const std::string &Name);
 
+/// \returns true if \p Name is an owning smart pointer such as Ref,
+/// CheckedPtr, and unique_ptr.
+bool isOwnerPtr(const std::string &Name);
+
 /// \returns true if \p Name is a smart pointer type name, false if not.
bool isSmartPtrClass(const std::string &Name); diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp index 9585ceb40f95e..791e70998477f 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp @@ -29,12 +29,12 @@ namespace { class RawPtrRefCallArgsChecker : public Checker> { BugType Bug; - mutable BugReporter *BR; TrivialFunctionAnalysis TFA; EnsureFunctionAnalysis EFA; protected: + mutable BugReporter *BR; mutable std::optional RTC; public: @@ -46,6 +46,7 @@ class RawPtrRefCallArgsChecker virtual bool isSafePtr(const CXXRecordDecl *Record) const = 0; virtual bool isSafePtrType(const QualType type) const = 0; virtual bool isSafeExpr(const Expr *) const { return false; } + virtual bool isSafeDecl(const Decl *) const { return false; } virtual const char *ptrKind() const = 0; void checkASTDecl(const TranslationUnitDecl *TUD, AnalysisManager &MGR, @@ -214,6 +215,7 @@ class RawPtrRefCallArgsChecker Arg, /*StopAtFirstRefCountedObj=*/true, [&](const clang::CXXRecordDecl *Record) { return isSafePtr(Record); }, [&](const clang::QualType T) { return isSafePtrType(T); }, + [&](const clang::Decl *D) { return isSafeDecl(D); }, [&](const clang::Expr *ArgOrigin, bool IsSafe) { if (IsSafe) return true; @@ -479,6 +481,11 @@ class UnretainedCallArgsChecker final : public RawPtrRefCallArgsChecker { isa(E); } + bool isSafeDecl(const Decl *D) const final { + // Treat NS/CF globals in system header as immortal. + return BR->getSourceManager().isInSystemHeader(D->getLocation()); + } + const char *ptrKind() const final { return "unretained"; } }; diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp index dd9701fbbb017..c13df47920f72 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp @@ -166,10 +166,10 @@ bool isGuardedScopeEmbeddedInGuardianScope(const VarDecl *Guarded, class RawPtrRefLocalVarsChecker : public Checker> { BugType Bug; - mutable BugReporter *BR; EnsureFunctionAnalysis EFA; protected: + mutable BugReporter *BR; mutable std::optional RTC; public: @@ -180,6 +180,7 @@ class RawPtrRefLocalVarsChecker virtual bool isSafePtr(const CXXRecordDecl *) const = 0; virtual bool isSafePtrType(const QualType) const = 0; virtual bool isSafeExpr(const Expr *) const { return false; } + virtual bool isSafeDecl(const Decl *) const { return false; } virtual const char *ptrKind() const = 0; void checkASTDecl(const TranslationUnitDecl *TUD, AnalysisManager &MGR, @@ -288,6 +289,7 @@ class RawPtrRefLocalVarsChecker return isSafePtr(Record); }, [&](const clang::QualType Type) { return isSafePtrType(Type); }, + [&](const clang::Decl *D) { return isSafeDecl(D); }, [&](const clang::Expr *InitArgOrigin, bool IsSafe) { if (!InitArgOrigin || IsSafe) return true; @@ -443,6 +445,10 @@ class UnretainedLocalVarsChecker final : public RawPtrRefLocalVarsChecker { return ento::cocoa::isCocoaObjectRef(E->getType()) && isa(E); } + bool isSafeDecl(const Decl *D) const final { + // Treat NS/CF globals in system header as immortal. 
+ return BR->getSourceManager().isInSystemHeader(D->getLocation()); + } const char *ptrKind() const final { return "unretained"; } }; diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp index 6f3a280971cb8..c6421f8616264 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp @@ -121,13 +121,13 @@ class DerefFuncDeleteExprVisitor return true; } } else if (auto *RD = dyn_cast(PointeeType)) { - if (declaresSameEntity(RD->getOriginalDecl(), ClassDecl)) + if (declaresSameEntity(RD->getDecl(), ClassDecl)) return true; } else if (auto *ST = dyn_cast(PointeeType)) { auto Type = ST->getReplacementType(); if (auto *RD = dyn_cast(Type)) { - if (declaresSameEntity(RD->getOriginalDecl(), ClassDecl)) + if (declaresSameEntity(RD->getDecl(), ClassDecl)) return true; } } diff --git a/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp b/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp index 02f34bc30f554..c905ee6bc9fc9 100644 --- a/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp +++ b/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp @@ -173,7 +173,7 @@ const PointerToMemberData *BasicValueFactory::getPointerToMemberData( return D; } -LLVM_ATTRIBUTE_UNUSED static bool hasNoRepeatedElements( +[[maybe_unused]] static bool hasNoRepeatedElements( llvm::ImmutableList BaseSpecList) { llvm::SmallPtrSet BaseSpecSeen; for (const CXXBaseSpecifier *BaseSpec : BaseSpecList) { diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp index 06ba01507fa4f..62460cc6f5b19 100644 --- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp +++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp @@ -89,7 +89,7 @@ static bool isCallback(QualType T) { T = T->getPointeeType(); if (const RecordType *RT = T->getAsStructureType()) { - const RecordDecl *RD = RT->getOriginalDecl()->getDefinitionOrSelf(); + const RecordDecl *RD = RT->getDecl()->getDefinitionOrSelf(); for (const auto *I : RD->fields()) { QualType FieldT = I->getType(); if (FieldT->isBlockPointerType() || FieldT->isFunctionPointerType()) @@ -391,9 +391,8 @@ bool CallEvent::isVariadic(const Decl *D) { static bool isTransparentUnion(QualType T) { const RecordType *UT = T->getAsUnionType(); - return UT && UT->getOriginalDecl() - ->getMostRecentDecl() - ->hasAttr(); + return UT && + UT->getDecl()->getMostRecentDecl()->hasAttr(); } // In some cases, symbolic cases should be transformed before we associate diff --git a/clang/lib/StaticAnalyzer/Core/CheckerManager.cpp b/clang/lib/StaticAnalyzer/Core/CheckerManager.cpp index 44c6f9f52cca6..8ee4832643b91 100644 --- a/clang/lib/StaticAnalyzer/Core/CheckerManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/CheckerManager.cpp @@ -731,19 +731,22 @@ void CheckerManager::runCheckersForEvalCall(ExplodedNodeSet &Dst, ExplodedNodeSet checkDst; NodeBuilder B(Pred, checkDst, Eng.getBuilderContext()); + ProgramStateRef State = Pred->getState(); + CallEventRef<> UpdatedCall = Call.cloneWithState(State); + // Check if any of the EvalCall callbacks can evaluate the call. for (const auto &EvalCallChecker : EvalCallCheckers) { // TODO: Support the situation when the call doesn't correspond // to any Expr. 
ProgramPoint L = ProgramPoint::getProgramPoint( - Call.getOriginExpr(), ProgramPoint::PostStmtKind, + UpdatedCall->getOriginExpr(), ProgramPoint::PostStmtKind, Pred->getLocationContext(), EvalCallChecker.Checker); bool evaluated = false; - { // CheckerContext generates transitions(populates checkDest) on + { // CheckerContext generates transitions (populates checkDest) on // destruction, so introduce the scope to make sure it gets properly // populated. CheckerContext C(B, Eng, Pred, L); - evaluated = EvalCallChecker(Call, C); + evaluated = EvalCallChecker(*UpdatedCall, C); } #ifndef NDEBUG if (evaluated && evaluatorChecker) { @@ -774,7 +777,7 @@ void CheckerManager::runCheckersForEvalCall(ExplodedNodeSet &Dst, // If none of the checkers evaluated the call, ask ExprEngine to handle it. if (!evaluatorChecker) { NodeBuilder B(Pred, Dst, Eng.getBuilderContext()); - Eng.defaultEvalCall(B, Pred, Call, CallOpts); + Eng.defaultEvalCall(B, Pred, *UpdatedCall, CallOpts); } } } diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp index 0c491b8c4ca90..ac6c1d76b6b8e 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp @@ -628,6 +628,8 @@ void ExprEngine::VisitCallExpr(const CallExpr *CE, ExplodedNode *Pred, ProgramStateRef ExprEngine::finishArgumentConstruction(ProgramStateRef State, const CallEvent &Call) { + // WARNING: The state attached to 'Call' may be obsolete, do not call any + // methods that rely on it! const Expr *E = Call.getOriginExpr(); // FIXME: Constructors to placement arguments of operator new // are not supported yet. @@ -653,6 +655,8 @@ ProgramStateRef ExprEngine::finishArgumentConstruction(ProgramStateRef State, void ExprEngine::finishArgumentConstruction(ExplodedNodeSet &Dst, ExplodedNode *Pred, const CallEvent &Call) { + // WARNING: The state attached to 'Call' may be obsolete, do not call any + // methods that rely on it! ProgramStateRef State = Pred->getState(); ProgramStateRef CleanedState = finishArgumentConstruction(State, Call); if (CleanedState == State) { @@ -670,35 +674,33 @@ void ExprEngine::finishArgumentConstruction(ExplodedNodeSet &Dst, } void ExprEngine::evalCall(ExplodedNodeSet &Dst, ExplodedNode *Pred, - const CallEvent &Call) { - // WARNING: At this time, the state attached to 'Call' may be older than the - // state in 'Pred'. This is a minor optimization since CheckerManager will - // use an updated CallEvent instance when calling checkers, but if 'Call' is - // ever used directly in this function all callers should be updated to pass - // the most recent state. (It is probably not worth doing the work here since - // for some callers this will not be necessary.) + const CallEvent &CallTemplate) { + // NOTE: CallTemplate is called a "template" because its attached state may + // be obsolete (compared to the state of Pred). The state-dependent methods + // of CallEvent should be used only after a `cloneWithState` call that + // attaches the up-to-date state to this template object. // Run any pre-call checks using the generic call interface. ExplodedNodeSet dstPreVisit; - getCheckerManager().runCheckersForPreCall(dstPreVisit, Pred, - Call, *this); + getCheckerManager().runCheckersForPreCall(dstPreVisit, Pred, CallTemplate, + *this); // Actually evaluate the function call. 
We try each of the checkers
   // to see if they can evaluate the function call, and get a callback at
   // defaultEvalCall if all of them fail.
   ExplodedNodeSet dstCallEvaluated;
-  getCheckerManager().runCheckersForEvalCall(dstCallEvaluated, dstPreVisit,
-                                             Call, *this, EvalCallOptions());
+  getCheckerManager().runCheckersForEvalCall(
+      dstCallEvaluated, dstPreVisit, CallTemplate, *this, EvalCallOptions());
 
   // If there were other constructors called for object-type arguments
   // of this call, clean them up.
   ExplodedNodeSet dstArgumentCleanup;
   for (ExplodedNode *I : dstCallEvaluated)
-    finishArgumentConstruction(dstArgumentCleanup, I, Call);
+    finishArgumentConstruction(dstArgumentCleanup, I, CallTemplate);
 
   ExplodedNodeSet dstPostCall;
   getCheckerManager().runCheckersForPostCall(dstPostCall, dstArgumentCleanup,
-                                             Call, *this);
+                                             CallTemplate, *this);
 
   // Escaping symbols conjured during invalidating the regions above.
   // Note that, for inlined calls the nodes were put back into the worklist,
@@ -708,12 +710,13 @@ void ExprEngine::evalCall(ExplodedNodeSet &Dst, ExplodedNode *Pred,
   // Run pointerEscape callback with the newly conjured symbols.
   SmallVector<std::pair<SVal, SVal>, 8> Escaped;
   for (ExplodedNode *I : dstPostCall) {
-    NodeBuilder B(I, Dst, *currBldrCtx);
     ProgramStateRef State = I->getState();
+    CallEventRef<> Call = CallTemplate.cloneWithState(State);
+    NodeBuilder B(I, Dst, *currBldrCtx);
     Escaped.clear();
     {
       unsigned Arg = -1;
-      for (const ParmVarDecl *PVD : Call.parameters()) {
+      for (const ParmVarDecl *PVD : Call->parameters()) {
         ++Arg;
         QualType ParamTy = PVD->getType();
         if (ParamTy.isNull() ||
@@ -722,13 +725,13 @@ void ExprEngine::evalCall(ExplodedNodeSet &Dst, ExplodedNode *Pred,
         QualType Pointee = ParamTy->getPointeeType();
         if (Pointee.isConstQualified() || Pointee->isVoidType())
           continue;
-        if (const MemRegion *MR = Call.getArgSVal(Arg).getAsRegion())
+        if (const MemRegion *MR = Call->getArgSVal(Arg).getAsRegion())
           Escaped.emplace_back(loc::MemRegionVal(MR),
                                State->getSVal(MR, Pointee));
       }
     }
     State = processPointerEscapedOnBind(State, Escaped, I->getLocationContext(),
-                                        PSK_EscapeOutParameters, &Call);
+                                        PSK_EscapeOutParameters, &*Call);
 
     if (State == I->getState())
       Dst.insert(I);
@@ -1212,48 +1215,47 @@ static bool isTrivialObjectAssignment(const CallEvent &Call) {
 }
 
 void ExprEngine::defaultEvalCall(NodeBuilder &Bldr, ExplodedNode *Pred,
-                                 const CallEvent &CallTemplate,
+                                 const CallEvent &Call,
                                  const EvalCallOptions &CallOpts) {
   // Make sure we have the most recent state attached to the call.
   ProgramStateRef State = Pred->getState();
-  CallEventRef<> Call = CallTemplate.cloneWithState(State);
 
   // Special-case trivial assignment operators.
-  if (isTrivialObjectAssignment(*Call)) {
-    performTrivialCopy(Bldr, Pred, *Call);
+  if (isTrivialObjectAssignment(Call)) {
+    performTrivialCopy(Bldr, Pred, Call);
     return;
   }
 
   // Try to inline the call.
   // The origin expression here is just used as a kind of checksum;
   // this should still be safe even for CallEvents that don't come from exprs.
-  const Expr *E = Call->getOriginExpr();
+  const Expr *E = Call.getOriginExpr();
 
   ProgramStateRef InlinedFailedState = getInlineFailedState(State, E);
   if (InlinedFailedState) {
     // If we already tried once and failed, make sure we don't retry later.
State = InlinedFailedState; } else { - RuntimeDefinition RD = Call->getRuntimeDefinition(); - Call->setForeign(RD.isForeign()); + RuntimeDefinition RD = Call.getRuntimeDefinition(); + Call.setForeign(RD.isForeign()); const Decl *D = RD.getDecl(); - if (shouldInlineCall(*Call, D, Pred, CallOpts)) { + if (shouldInlineCall(Call, D, Pred, CallOpts)) { if (RD.mayHaveOtherDefinitions()) { AnalyzerOptions &Options = getAnalysisManager().options; // Explore with and without inlining the call. if (Options.getIPAMode() == IPAK_DynamicDispatchBifurcate) { - BifurcateCall(RD.getDispatchRegion(), *Call, D, Bldr, Pred); + BifurcateCall(RD.getDispatchRegion(), Call, D, Bldr, Pred); return; } // Don't inline if we're not in any dynamic dispatch mode. if (Options.getIPAMode() != IPAK_DynamicDispatch) { - conservativeEvalCall(*Call, Bldr, Pred, State); + conservativeEvalCall(Call, Bldr, Pred, State); return; } } - ctuBifurcate(*Call, D, Bldr, Pred, State); + ctuBifurcate(Call, D, Bldr, Pred, State); return; } } @@ -1261,10 +1263,10 @@ void ExprEngine::defaultEvalCall(NodeBuilder &Bldr, ExplodedNode *Pred, // If we can't inline it, clean up the state traits used only if the function // is inlined. State = removeStateTraitsUsedForArrayEvaluation( - State, dyn_cast_or_null<CXXConstructExpr>(E), Call->getLocationContext()); + State, dyn_cast_or_null<CXXConstructExpr>(E), Call.getLocationContext()); // Also handle the return value and invalidate the regions. - conservativeEvalCall(*Call, Bldr, Pred, State); + conservativeEvalCall(Call, Bldr, Pred, State); } void ExprEngine::BifurcateCall(const MemRegion *BifurReg, diff --git a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp index ab45e678bafd5..245a73047513b 100644 --- a/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/RangeConstraintManager.cpp @@ -983,7 +983,7 @@ class EquivalenceClass : public llvm::FoldingSetNode { } /// Check equivalence data for consistency.
- [[nodiscard]] LLVM_ATTRIBUTE_UNUSED static bool + [[nodiscard]] [[maybe_unused]] static bool isClassDataConsistent(ProgramStateRef State); [[nodiscard]] QualType getType() const { @@ -1041,8 +1041,7 @@ class EquivalenceClass : public llvm::FoldingSetNode { // Constraint functions //===----------------------------------------------------------------------===// -[[nodiscard]] LLVM_ATTRIBUTE_UNUSED bool -areFeasible(ConstraintRangeTy Constraints) { +[[nodiscard]] [[maybe_unused]] bool areFeasible(ConstraintRangeTy Constraints) { return llvm::none_of( Constraints, [](const std::pair<EquivalenceClass, RangeSet> &ClassConstraint) { @@ -1134,7 +1133,7 @@ template return End; } -[[nodiscard]] LLVM_ATTRIBUTE_UNUSED inline std::optional<RangeSet> +[[nodiscard]] [[maybe_unused]] inline std::optional<RangeSet> intersect(RangeSet::Factory &F, const RangeSet *End) { // This is an extraneous conversion from a raw pointer into // std::optional<RangeSet> diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp index af0ef52334bd7..2838533c1a406 100644 --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp @@ -2457,7 +2457,7 @@ NonLoc RegionStoreManager::createLazyBinding(RegionBindingsConstRef B, SVal RegionStoreManager::getBindingForStruct(RegionBindingsConstRef B, const TypedValueRegion *R) { const RecordDecl *RD = - R->getValueType()->castAsCanonical<RecordType>()->getOriginalDecl(); + R->getValueType()->castAsCanonical<RecordType>()->getDecl(); if (!RD->getDefinition()) return UnknownVal(); diff --git a/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp index 84a9c43d3572e..6108931f737d4 100644 --- a/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp +++ b/clang/lib/StaticAnalyzer/Core/SimpleSValBuilder.cpp @@ -1111,6 +1111,10 @@ SVal SimpleSValBuilder::evalBinOpLN(ProgramStateRef state, assert(!BinaryOperator::isComparisonOp(op) && "arguments to comparison ops must be of the same type"); + SVal simplifiedRhs = simplifySVal(state, rhs); + if (auto simplifiedRhsAsNonLoc = simplifiedRhs.getAs<NonLoc>()) + rhs = *simplifiedRhsAsNonLoc; + // Special case: rhs is a zero constant. if (rhs.isZeroConstant()) return lhs; diff --git a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp index 4efde59aab763..82b560b2613f8 100644 --- a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp +++ b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp @@ -62,7 +62,9 @@ ALWAYS_ENABLED_STATISTIC( "The # of visited basic blocks in the analyzed functions."); ALWAYS_ENABLED_STATISTIC(PercentReachableBlocks, "The % of reachable basic blocks."); -STAT_MAX(MaxCFGSize, "The maximum number of basic blocks in a function."); +ALWAYS_ENABLED_STATISTIC(MaxCFGSize, + "The maximum number of basic blocks in a function."); +static UnsignedEPStat CFGSize("CFGSize"); //===----------------------------------------------------------------------===// // AnalysisConsumer declaration.
//===----------------------------------------------------------------------===// @@ -721,6 +723,7 @@ AnalysisConsumer::getModeForDecl(Decl *D, AnalysisMode Mode) { } static UnsignedEPStat PathRunningTime("PathRunningTime"); +static UnsignedEPStat SyntaxRunningTime("SyntaxRunningTime"); void AnalysisConsumer::HandleCode(Decl *D, AnalysisMode Mode, ExprEngine::InliningModes IMode, @@ -757,9 +760,11 @@ void AnalysisConsumer::HandleCode(Decl *D, AnalysisMode Mode, ++NumFunctionsAnalyzedSyntaxOnly; if (SyntaxCheckTimer) { SyntaxCheckTimer->stopTimer(); - llvm::TimeRecord CheckerEndTime = SyntaxCheckTimer->getTotalTime(); - CheckerEndTime -= CheckerStartTime; - DisplayTime(CheckerEndTime); + llvm::TimeRecord CheckerDuration = + SyntaxCheckTimer->getTotalTime() - CheckerStartTime; + FunctionSummaries.findOrInsertSummary(D)->second.SyntaxRunningTime = + std::lround(CheckerDuration.getWallTime() * 1000); + DisplayTime(CheckerDuration); if (AnalyzerTimers && ShouldClearTimersToPreventDisplayingThem) { AnalyzerTimers->clear(); } @@ -783,15 +788,31 @@ void AnalysisConsumer::HandleCode(Decl *D, AnalysisMode Mode, void AnalysisConsumer::RunPathSensitiveChecks(Decl *D, ExprEngine::InliningModes IMode, SetOfConstDecls *VisitedCallees) { + auto *CFG = Mgr->getCFG(D); + // Construct the analysis engine. First check if the CFG is valid. // FIXME: Inter-procedural analysis will need to handle invalid CFGs. - if (!Mgr->getCFG(D)) + if (!CFG) return; + CFGSize.set(CFG->size()); + + auto *DeclContext = Mgr->getAnalysisDeclContext(D); // See if the LiveVariables analysis scales. - if (!Mgr->getAnalysisDeclContext(D)->getAnalysis<RelaxedLiveVariables>()) + if (!DeclContext->getAnalysis<RelaxedLiveVariables>()) return; + // DeclContext's declaration is the redeclaration of D that has a body. + const Decl *DefDecl = DeclContext->getDecl(); + + // Get the SyntaxRunningTime from the function summary, because it is computed + // during the AM_Syntax analysis, which happens at a different point in time + // and in a different order, but always before AM_Path. + if (const auto *Summary = FunctionSummaries.findSummary(DefDecl); + Summary && Summary->SyntaxRunningTime.has_value()) { + SyntaxRunningTime.set(*Summary->SyntaxRunningTime); + } + ExprEngine Eng(CTU, *Mgr, VisitedCallees, &FunctionSummaries, IMode); // Execute the worklist algorithm.
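Both timing hunks here (the syntax-check one above and the path-sensitive one that follows) compute a duration the same way: snapshot the timer's running total before the work, subtract that snapshot from the new total afterwards, and store the wall-clock delta in milliseconds. A minimal sketch of that pattern, assuming only llvm/Support/Timer.h; the helper name elapsedMillis is illustrative, not part of the patch:

#include "llvm/Support/Timer.h"
#include <cmath>
#include <cstdint>

// getTotalTime() accumulates across start/stop cycles, so subtracting the
// snapshot taken before the most recent interval isolates that interval.
static uint64_t elapsedMillis(llvm::Timer &T, const llvm::TimeRecord &Start) {
  llvm::TimeRecord Duration = T.getTotalTime() - Start;
  // getWallTime() reports seconds; the entry-point stats store milliseconds.
  return static_cast<uint64_t>(std::lround(Duration.getWallTime() * 1000));
}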
@@ -804,11 +825,11 @@ void AnalysisConsumer::RunPathSensitiveChecks(Decl *D, Mgr->options.MaxNodesPerTopLevelFunction); if (ExprEngineTimer) { ExprEngineTimer->stopTimer(); - llvm::TimeRecord ExprEngineEndTime = ExprEngineTimer->getTotalTime(); - ExprEngineEndTime -= ExprEngineStartTime; + llvm::TimeRecord ExprEngineDuration = + ExprEngineTimer->getTotalTime() - ExprEngineStartTime; PathRunningTime.set(static_cast<unsigned>( - std::lround(ExprEngineEndTime.getWallTime() * 1000))); - DisplayTime(ExprEngineEndTime); + std::lround(ExprEngineDuration.getWallTime() * 1000))); + DisplayTime(ExprEngineDuration); if (AnalyzerTimers && ShouldClearTimersToPreventDisplayingThem) { AnalyzerTimers->clear(); } diff --git a/clang/lib/Support/RISCVVIntrinsicUtils.cpp b/clang/lib/Support/RISCVVIntrinsicUtils.cpp index 5a4e805d4a9d1..dad3d0dae423a 100644 --- a/clang/lib/Support/RISCVVIntrinsicUtils.cpp +++ b/clang/lib/Support/RISCVVIntrinsicUtils.cpp @@ -654,6 +654,9 @@ PrototypeDescriptor::parsePrototypeDescriptor( case 'F': TM |= TypeModifier::Float; break; + case 'Y': + TM |= TypeModifier::BFloat; + break; case 'S': TM |= TypeModifier::LMUL1; break; @@ -704,6 +707,8 @@ void RVVType::applyModifier(const PrototypeDescriptor &Transformer) { ElementBitwidth *= 2; LMUL.MulLog2LMUL(1); Scale = LMUL.getScale(ElementBitwidth); + if (ScalarType == ScalarTypeKind::BFloat) + ScalarType = ScalarTypeKind::Float; break; case VectorTypeModifier::Widening4XVector: ElementBitwidth *= 4; diff --git a/clang/lib/Tooling/CompilationDatabase.cpp b/clang/lib/Tooling/CompilationDatabase.cpp index 860457acced85..4070bb81c6f74 100644 --- a/clang/lib/Tooling/CompilationDatabase.cpp +++ b/clang/lib/Tooling/CompilationDatabase.cpp @@ -403,7 +403,7 @@ namespace tooling { // This anchor is used to force the linker to link in the generated object file // and thus register the JSONCompilationDatabasePlugin. extern volatile int JSONAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED JSONAnchorDest = JSONAnchorSource; +[[maybe_unused]] static int JSONAnchorDest = JSONAnchorSource; } // namespace tooling } // namespace clang diff --git a/clang/lib/Tooling/Execution.cpp b/clang/lib/Tooling/Execution.cpp index 247b260b97edc..d0499fa364cfe 100644 --- a/clang/lib/Tooling/Execution.cpp +++ b/clang/lib/Tooling/Execution.cpp @@ -96,9 +96,9 @@ createExecutorFromCommandLineArgs(int &argc, const char **argv, // and thus register the StandaloneToolExecutorPlugin etc. extern volatile int StandaloneToolExecutorAnchorSource; extern volatile int AllTUsToolExecutorAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED StandaloneToolExecutorAnchorDest = +[[maybe_unused]] static int StandaloneToolExecutorAnchorDest = StandaloneToolExecutorAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED AllTUsToolExecutorAnchorDest = +[[maybe_unused]] static int AllTUsToolExecutorAnchorDest = AllTUsToolExecutorAnchorSource; } // end namespace tooling diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp index 90fd1f91b9ef2..9d49d72dea69b 100644 --- a/clang/lib/Tooling/Syntax/BuildTree.cpp +++ b/clang/lib/Tooling/Syntax/BuildTree.cpp @@ -77,8 +77,10 @@ static Expr *IgnoreImplicit(Expr *E) { IgnoreCXXFunctionalCastExprWrappingConstructor); } -LLVM_ATTRIBUTE_UNUSED -static bool isImplicitExpr(Expr *E) { return IgnoreImplicit(E) != E; } +[[maybe_unused]] +static bool isImplicitExpr(Expr *E) { + return IgnoreImplicit(E) != E; +} namespace { /// Get start location of the Declarator from the TypeLoc.
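The attribute churn in the files above follows one mechanical rule: the LLVM_ATTRIBUTE_UNUSED macro (which expands to __attribute__((__unused__)) where supported) is replaced by standard C++17 [[maybe_unused]], and the attribute moves to the front of the declaration. A minimal before/after sketch; the anchor names are illustrative, not from the patch:

// Before: GNU-style macro, placed after the type per the old convention.
//   static int LLVM_ATTRIBUTE_UNUSED ExampleAnchorDest = ExampleAnchorSource;

// After: standard attribute leading the declaration. Anchors like this exist
// only to force the linker to keep an object file whose global initializer
// registers a plugin; the value itself is never read.
extern volatile int ExampleAnchorSource;
[[maybe_unused]] static int ExampleAnchorDest = ExampleAnchorSource;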
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index f47bc49d9a1a8..0b7d51be8d824 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -63,6 +63,19 @@ constexpr int test_address_of_incomplete_array_type() { // both-error {{never pr static_assert(test_address_of_incomplete_array_type() == 1234, ""); // both-error {{constant}} \ // both-note {{in call}} +namespace LocalExternRedecl { + constexpr int externRedecl1() { + extern int arr[]; + return 0; + } + constexpr int externRedecl2() { // both-error {{never produces a constant expression}} + extern int arr[]; + __builtin_memmove(&arr, &arr, 4 * sizeof(arr[0])); // both-note 2{{incomplete type}} + return 1234; + } + static_assert(externRedecl2() == 1234); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} +} struct NonTrivial { constexpr NonTrivial() : n(0) {} diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c index 657a920e7d02c..cfdc9d0d3dd86 100644 --- a/clang/test/AST/ByteCode/c.c +++ b/clang/test/AST/ByteCode/c.c @@ -372,3 +372,12 @@ void discardedCmp(void) /// ArraySubscriptExpr that's not an lvalue typedef unsigned char U __attribute__((vector_size(1))); void nonLValueASE(U f) { f[0] = f[((U)(U){0})[0]]; } + +static char foo_(a) // all-warning {{definition without a prototype}} + char a; +{ + return 'a'; +} +static void bar_(void) { + foo_(foo_(1)); +} diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp index 72bc7622eb6d8..8efd3201d6200 100644 --- a/clang/test/AST/ByteCode/cxx11.cpp +++ b/clang/test/AST/ByteCode/cxx11.cpp @@ -146,6 +146,14 @@ void testValueInRangeOfEnumerationValues() { const NumberType neg_one = (NumberType) ((NumberType) 0 - (NumberType) 1); // ok, not a constant expression context } +struct EnumTest { + enum type { + Type1, + BOUND + }; + static const type binding_completed = type(BOUND + 1); // both-error {{in-class initializer for static data member is not a constant expression}} \ + // both-note {{integer value 2 is outside the valid range of values}} +}; template struct Bitfield { static constexpr T max = static_cast((1 << size) - 1); diff --git a/clang/test/AST/ByteCode/cxx14.cpp b/clang/test/AST/ByteCode/cxx14.cpp new file mode 100644 index 0000000000000..9622311e100cb --- /dev/null +++ b/clang/test/AST/ByteCode/cxx14.cpp @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -std=c++14 -verify=both,expected %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++14 -verify=both,ref %s + + + +constexpr int(*null_ptr)() = nullptr; +constexpr int test4 = (*null_ptr)(); // both-error {{must be initialized by a constant expression}} \ + // both-note {{evaluates to a null function pointer}} + diff --git a/clang/test/AST/ByteCode/extern.cpp b/clang/test/AST/ByteCode/extern.cpp index a616269911a7e..c3215931d41f8 100644 --- a/clang/test/AST/ByteCode/extern.cpp +++ b/clang/test/AST/ByteCode/extern.cpp @@ -1,9 +1,11 @@ // RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=both,expected %s -// RUN: %clang_cc1 -verify=both,ref %s - +// RUN: %clang_cc1 -verify=both,ref %s // both-no-diagnostics +extern const double Num; +extern const double Num = 12; + extern const int E; constexpr int getE() { return E; diff --git a/clang/test/AST/ByteCode/typeid.cpp b/clang/test/AST/ByteCode/typeid.cpp index 00b01c8e40682..aca18d4e25277 100644 --- a/clang/test/AST/ByteCode/typeid.cpp +++ b/clang/test/AST/ByteCode/typeid.cpp 
@@ -59,3 +59,16 @@ namespace TypeidPtrInEvaluationResult { consteval const std::type_info *ftype_info() { return &typeid(c); } const std::type_info *T1 = ftype_info(); } + +// Regression test for crash in ArrayElemPtrPop with typeid pointers. GH-163127 +namespace TypeidPtrRegression { + void dontcrash() { + constexpr auto res = ((void**)&typeid(int))[0]; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cast that performs the conversions of a reinterpret_cast is not allowed in a constant expression}} + } + void dontcrash2() { + constexpr auto res = ((void**)&typeid(int))[1]; // both-error {{must be initialized by a constant expression}} \ + // both-note {{cast that performs the conversions of a reinterpret_cast is not allowed in a constant expression}} + } +} + diff --git a/clang/test/AST/HLSL/ByteAddressBuffers-AST.hlsl b/clang/test/AST/HLSL/ByteAddressBuffers-AST.hlsl index 43d8ddee6ccad..61d5e5ab44c97 100644 --- a/clang/test/AST/HLSL/ByteAddressBuffers-AST.hlsl +++ b/clang/test/AST/HLSL/ByteAddressBuffers-AST.hlsl @@ -142,5 +142,19 @@ RESOURCE Buffer; // CHECK-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]' // CHECK-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline +// GetDimensions method + +// CHECK-NEXT: CXXMethodDecl {{.*}} GetDimensions 'void (out unsigned int)' +// CHECK-NEXT: ParmVarDecl {{.*}} dim 'unsigned int &__restrict' +// CHECK-NEXT: HLSLParamModifierAttr {{.*}} out +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: CallExpr {{.*}} 'void' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(...) noexcept' +// CHECK-NEXT: DeclRefExpr {{.*}} '<builtin fn type>' Function {{.*}} '__builtin_hlsl_resource_getdimensions_x' 'void (...) noexcept' +// CHECK-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle {{.*}} +// CHECK-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]' lvalue implicit this +// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'dim' 'unsigned int &__restrict' +// CHECK-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline + // CHECK-NOSUBSCRIPT-NOT: CXXMethodDecl {{.*}} operator[] 'const char8_t &(unsigned int) const' // CHECK-NOSUBSCRIPT-NOT: CXXMethodDecl {{.*}} operator[] 'char8_t &(unsigned int)' diff --git a/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl index e72207e10132c..7a8c57c59643d 100644 --- a/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl @@ -408,6 +408,28 @@ RESOURCE Buffer; // CHECK-CONSUME-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]' lvalue implicit this // CHECK-CONSUME-NEXT: IntegerLiteral {{.*}} 'int' -1 +// GetDimensions method + +// CHECK: CXXMethodDecl {{.*}} GetDimensions 'void (out unsigned int, out unsigned int)' +// CHECK-NEXT: ParmVarDecl {{.*}} numStructs 'unsigned int &__restrict' +// CHECK-NEXT: HLSLParamModifierAttr {{.*}} out +// CHECK-NEXT: ParmVarDecl {{.*}} stride 'unsigned int &__restrict' +// CHECK-NEXT: HLSLParamModifierAttr {{.*}} out +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: CallExpr {{.*}} 'void' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(...) noexcept' +// CHECK-NEXT: DeclRefExpr {{.*}} '<builtin fn type>' Function {{.*}} '__builtin_hlsl_resource_getdimensions_x' 'void (...)
noexcept' +// CHECK-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle {{.*}} +// CHECK-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]' lvalue implicit this +// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'numStructs' 'unsigned int &__restrict' +// CHECK-NEXT: CallExpr {{.*}} 'void' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(...) noexcept' +// CHECK-NEXT: DeclRefExpr {{.*}} '<builtin fn type>' Function {{.*}} '__builtin_hlsl_resource_getstride' 'void (...) noexcept' +// CHECK-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle {{.*}} +// CHECK-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]' lvalue implicit this +// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'stride' 'unsigned int &__restrict' +// CHECK-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline + // CHECK: ClassTemplateSpecializationDecl {{.*}} class [[RESOURCE]] definition // CHECK: TemplateArgument type 'float' // CHECK-NEXT: BuiltinType {{.*}} 'float' diff --git a/clang/test/AST/HLSL/TypedBuffers-AST.hlsl b/clang/test/AST/HLSL/TypedBuffers-AST.hlsl index 5182ce194cfb0..14e274d3855ed 100644 --- a/clang/test/AST/HLSL/TypedBuffers-AST.hlsl +++ b/clang/test/AST/HLSL/TypedBuffers-AST.hlsl @@ -214,6 +214,20 @@ RESOURCE Buffer; // CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'Index' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline +// GetDimensions method + +// CHECK-NEXT: CXXMethodDecl {{.*}} GetDimensions 'void (out unsigned int)' +// CHECK-NEXT: ParmVarDecl {{.*}} dim 'unsigned int &__restrict' +// CHECK-NEXT: HLSLParamModifierAttr {{.*}} out +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: CallExpr {{.*}} 'void' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(...) noexcept' +// CHECK-NEXT: DeclRefExpr {{.*}} '<builtin fn type>' Function {{.*}} '__builtin_hlsl_resource_getdimensions_x' 'void (...)
noexcept' +// CHECK-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle {{.*}} +// CHECK-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]' lvalue implicit this +// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'dim' 'unsigned int &__restrict' +// CHECK-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline + // CHECK: ClassTemplateSpecializationDecl {{.*}} class [[RESOURCE]] definition // CHECK: TemplateArgument type 'float' diff --git a/clang/test/Analysis/Checkers/WebKit/forward-decl-checker.mm b/clang/test/Analysis/Checkers/WebKit/forward-decl-checker.mm index 104b555c1c41d..8aad838b71b35 100644 --- a/clang/test/Analysis/Checkers/WebKit/forward-decl-checker.mm +++ b/clang/test/Analysis/Checkers/WebKit/forward-decl-checker.mm @@ -11,6 +11,8 @@ Obj* provide_obj_ptr(); void receive_obj_ptr(Obj* p = nullptr); +void receive_obj_ref(Obj&); +void receive_obj_rref(Obj&&); sqlite3* open_db(); void close_db(sqlite3*); @@ -38,6 +40,16 @@ return obj; } +void opaque_call_arg(Obj* obj, Obj&& otherObj, const RefPtr<Obj>& safeObj, WeakPtr<Obj> weakObj, std::unique_ptr<Obj>& uniqObj) { + receive_obj_ref(*obj); + receive_obj_ptr(&*obj); + receive_obj_rref(std::move(otherObj)); + receive_obj_ref(*safeObj.get()); + receive_obj_ptr(weakObj.get()); + // expected-warning@-1{{Call argument for parameter 'p' uses a forward declared type 'Obj *'}} + receive_obj_ref(*uniqObj); +} + Obj&& provide_obj_rval(); void receive_obj_rval(Obj&& p); diff --git a/clang/test/Analysis/Checkers/WebKit/mock-system-header.h b/clang/test/Analysis/Checkers/WebKit/mock-system-header.h index 1e44de8eb62ad..d55b3abd34f4c 100644 --- a/clang/test/Analysis/Checkers/WebKit/mock-system-header.h +++ b/clang/test/Analysis/Checkers/WebKit/mock-system-header.h @@ -34,6 +34,8 @@ void os_log_msg(os_log_t oslog, os_log_type_t type, const char *msg, ...); typedef const struct __attribute__((objc_bridge(NSString))) __CFString * CFStringRef; +extern CFStringRef const kCFURLTagNamesKey; + #ifdef __OBJC__ @class NSString; @interface SystemObject { @@ -41,4 +43,8 @@ typedef const struct __attribute__((objc_bridge(NSString))) __CFString * CFStrin CFStringRef cf_string; } @end + +typedef NSString *NSNotificationName; +extern "C" NSNotificationName NSApplicationDidBecomeActiveNotification; + #endif diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h index a49faa1d25336..7055a94753a37 100644 --- a/clang/test/Analysis/Checkers/WebKit/mock-types.h +++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h @@ -25,23 +25,23 @@ namespace std { template <typename T> class unique_ptr { private: - T *t; + void *t; public: unique_ptr() : t(nullptr) { } unique_ptr(T *t) : t(t) { } ~unique_ptr() { if (t) - delete t; + delete static_cast<T *>(t); } template <typename U> unique_ptr(unique_ptr<U>&& u) : t(u.t) { u.t = nullptr; } - T *get() const { return t; } - T *operator->() const { return t; } - T &operator*() const { return *t; } + T *get() const { return static_cast<T *>(t); } + T *operator->() const { return get(); } + T &operator*() const { return *get(); } unique_ptr &operator=(T *) { return *this; } explicit operator bool() const { return !!t; } }; @@ -313,4 +313,90 @@ class UniqueRef { UniqueRef &operator=(T &) { return *this; } }; +class WeakPtrImpl { +private: + void* ptr { nullptr }; + mutable unsigned m_refCount { 0 }; + + template <typename> friend class CanMakeWeakPtr; + template <typename> friend class WeakPtr; + +public: + template <typename T> + static Ref<WeakPtrImpl> create(T& t) + { + return adoptRef(*new WeakPtrImpl(t)); + } + + void ref() const { m_refCount++; } + 
void deref() const { + m_refCount--; + if (!m_refCount) + delete const_cast<WeakPtrImpl*>(this); + } + + template <typename T> + T* get() { return static_cast<T*>(ptr); } + operator bool() const { return !!ptr; } + void clear() { ptr = nullptr; } + +private: + template <typename T> + WeakPtrImpl(T* t) + : ptr(static_cast<void*>(t)) + { } +}; + +template <typename T> +class CanMakeWeakPtr { +private: + RefPtr<WeakPtrImpl> impl; + + template <typename> friend class CanMakeWeakPtr; + template <typename> friend class WeakPtr; + + Ref<WeakPtrImpl> createWeakPtrImpl() { + if (!impl) + impl = WeakPtrImpl::create(static_cast<T&>(*this)); + return *impl; + } + +public: + ~CanMakeWeakPtr() { + if (!impl) + return; + impl->clear(); + impl = nullptr; + } +}; + +template <typename T> +class WeakPtr { +private: + RefPtr<WeakPtrImpl> impl; + +public: + WeakPtr(T& t) { + *this = t; + } + WeakPtr(T* t) { + *this = t; + } + + template <typename U> + WeakPtr operator=(U& obj) { + impl = obj.createWeakPtrImpl(); + } + + template <typename U> + WeakPtr operator=(U* obj) { + impl = obj ? obj->createWeakPtrImpl() : nullptr; + } + + T* get() { + return impl ? impl->get<T>() : nullptr; + } + +}; + #endif diff --git a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h index a5fc3d7f9a932..edf40115afa19 100644 --- a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h +++ b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h @@ -98,12 +98,20 @@ typedef CVImageBufferRef CVPixelBufferRef; typedef signed int CVReturn; CVReturn CVPixelBufferCreateWithIOSurface(CFAllocatorRef allocator, IOSurfaceRef surface, CFDictionaryRef pixelBufferAttributes, CF_RETURNS_RETAINED CVPixelBufferRef * pixelBufferOut); +extern "C" NSString *NSStringFromSelector(SEL aSelector); +extern "C" SEL NSSelectorFromString(NSString *aSelectorName); + +extern "C" NSString *NSStringFromClass(Class aClass); +extern "C" Class NSClassFromString(NSString *aClassName); + +extern "C" NSString *NSStringFromProtocol(Protocol *proto); +extern "C" Protocol * NSProtocolFromString(NSString *namestr); + CFRunLoopRef CFRunLoopGetCurrent(void); CFRunLoopRef CFRunLoopGetMain(void); extern CFTypeRef CFRetain(CFTypeRef cf); extern void CFRelease(CFTypeRef cf); #define CFSTR(cStr) ((CFStringRef) __builtin___CFStringMakeConstantString ("" cStr "")) -extern Class NSClassFromString(NSString *aClassName); #if __has_feature(objc_arc) id CFBridgingRelease(CFTypeRef X) { diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm index a517dbc394dbb..8bef24f93ceed 100644 --- a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm +++ b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm @@ -567,6 +567,64 @@ void foo() { } // namespace ns_retained_return_value +namespace autoreleased { + +NSString *provideAutoreleased() __attribute__((ns_returns_autoreleased)); +void consume(NSString *); + +void foo() { + consume(provideAutoreleased()); +} + +} // autoreleased + +namespace sel_string { + +void consumeStr(NSString *); +void consumeSel(SEL); +void consumeClass(Class); +void consumeProtocol(Protocol *); + +void foo() { + consumeStr(NSStringFromSelector(@selector(mutableCopy))); + consumeSel(NSSelectorFromString(@"mutableCopy")); + consumeStr(NSStringFromClass(NSNumber.class)); + consumeClass(NSClassFromString(@"NSNumber")); + consumeStr(NSStringFromProtocol(@protocol(NSCopying))); + consumeProtocol(NSProtocolFromString(@"NSCopying")); +} + +} // namespace sel_string + +namespace template_function { + +class Base { +public: + virtual ~Base() = default; + void send(dispatch_queue_t) const; + void 
ref() const; + void deref() const; +}; + +template <typename Traits> +class Derived : public Base { +public: + virtual ~Derived() = default; + + void send(typename Traits::MessageType) const; + + virtual OSObjectPtr msg(typename Traits::MessageType) const = 0; +}; + +template <typename Traits> +void Derived<Traits>::send(typename Traits::MessageType messageType) const +{ + OSObjectPtr dictionary = msg(messageType); + Base::send(dictionary.get()); +} + +} // namespace template_function + @interface TestObject : NSObject - (void)doWork:(NSString *)msg, ...; - (void)doWorkOnSelf; diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-local-vars.mm b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars.mm index 307a4d03fe101..f49e7bdb3e79c 100644 --- a/clang/test/Analysis/Checkers/WebKit/unretained-local-vars.mm +++ b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars.mm @@ -1,8 +1,11 @@ // RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLocalVarsChecker -verify %s #import "objc-mock-types.h" +#import "mock-system-header.h" void someFunction(); +extern "C" CFStringRef LocalGlobalCFString; +extern "C" NSString *LocalGlobalNSString; namespace raw_ptr { void foo() { @@ -535,6 +538,41 @@ unsigned foo() { } // namespace ns_retained_return_value +namespace autoreleased { + +NSString *provideAutoreleased() __attribute__((ns_returns_autoreleased)); +void consume(NSString *); + +void foo() { + auto *string = provideAutoreleased(); + consume(string); +} + +} // autoreleased + +namespace ns_global { + +void consumeCFString(CFStringRef); +void consumeNSString(NSString *); + +void cf() { + auto *str = kCFURLTagNamesKey; + consumeCFString(str); + auto *localStr = LocalGlobalCFString; + // expected-warning@-1{{Local variable 'localStr' is unretained and unsafe [alpha.webkit.UnretainedLocalVarsChecker]}} + consumeCFString(localStr); +} + +void ns() { + auto *str = NSApplicationDidBecomeActiveNotification; + consumeNSString(str); + auto *localStr = LocalGlobalNSString; + // expected-warning@-1{{Local variable 'localStr' is unretained and unsafe [alpha.webkit.UnretainedLocalVarsChecker]}} + consumeNSString(localStr); +} + +} + bool doMoreWorkOpaque(OtherObj*); SomeObj* provide(); diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-obj-arg.mm b/clang/test/Analysis/Checkers/WebKit/unretained-obj-arg.mm new file mode 100644 index 0000000000000..5c78b21d6c94f --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/unretained-obj-arg.mm @@ -0,0 +1,18 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedCallArgsChecker -verify %s + +#import "mock-types.h" +#import "mock-system-header.h" + +void consumeCFString(CFStringRef); +extern "C" CFStringRef LocalGlobalCFString; +void consumeNSString(NSString *); +extern "C" NSString *LocalGlobalNSString; + +void foo() { + consumeCFString(kCFURLTagNamesKey); + consumeCFString(LocalGlobalCFString); + // expected-warning@-1{{Call argument is unretained and unsafe}} + consumeNSString(NSApplicationDidBecomeActiveNotification); + consumeNSString(LocalGlobalNSString); + // expected-warning@-1{{Call argument is unretained and unsafe}} +} diff --git a/clang/test/Analysis/analyzer-enabled-checkers.c b/clang/test/Analysis/analyzer-enabled-checkers.c index 009233108a70a..bfe418b112a9d 100644 --- a/clang/test/Analysis/analyzer-enabled-checkers.c +++ b/clang/test/Analysis/analyzer-enabled-checkers.c @@ -19,6 +19,7 @@ // CHECK-NEXT: core.NonNullParamChecker // CHECK-NEXT: core.NonnilStringConstants // CHECK-NEXT: core.NullDereference +// CHECK-NEXT: core.NullPointerArithm // 
CHECK-NEXT: core.StackAddressEscape // CHECK-NEXT: core.UndefinedBinaryOperatorResult // CHECK-NEXT: core.VLASize diff --git a/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp b/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp index 2a0caad5950ec..3ff3bb1e81ef0 100644 --- a/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp +++ b/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp @@ -8,9 +8,10 @@ // CHECK-NEXT: "c:@F@fib#i#": { // CHECK-NEXT: "File": "{{.*}}entry-point-stats.cpp", // CHECK-NEXT: "DebugName": "fib(unsigned int)", +// CHECK-NEXT: "CFGSize": "5", // CHECK-NEXT: "PathRunningTime": "{{[0-9]+}}", +// CHECK-NEXT: "SyntaxRunningTime": "{{[0-9]+}}", // CHECK-NEXT: "MaxBugClassSize": "{{[0-9]+}}", -// CHECK-NEXT: "MaxCFGSize": "{{[0-9]+}}", // CHECK-NEXT: "MaxQueueSize": "{{[0-9]+}}", // CHECK-NEXT: "MaxReachableSize": "{{[0-9]+}}", // CHECK-NEXT: "MaxTimeSpentSolvingZ3Queries": "{{[0-9]+}}", @@ -45,9 +46,10 @@ // CHECK-NEXT: "c:@F@main#I#**C#": { // CHECK-NEXT: "File": "{{.*}}entry-point-stats.cpp", // CHECK-NEXT: "DebugName": "main(int, char **)", +// CHECK-NEXT: "CFGSize": "3", // CHECK-NEXT: "PathRunningTime": "{{[0-9]+}}", +// CHECK-NEXT: "SyntaxRunningTime": "{{[0-9]+}}", // CHECK-NEXT: "MaxBugClassSize": "{{[0-9]+}}", -// CHECK-NEXT: "MaxCFGSize": "{{[0-9]+}}", // CHECK-NEXT: "MaxQueueSize": "{{[0-9]+}}", // CHECK-NEXT: "MaxReachableSize": "{{[0-9]+}}", // CHECK-NEXT: "MaxTimeSpentSolvingZ3Queries": "{{[0-9]+}}", diff --git a/clang/test/Analysis/loc-folding.cpp b/clang/test/Analysis/loc-folding.cpp new file mode 100644 index 0000000000000..1fcb0668e50fe --- /dev/null +++ b/clang/test/Analysis/loc-folding.cpp @@ -0,0 +1,61 @@ +// RUN: %clang_analyze_cc1 -verify %s -analyzer-config eagerly-assume=false \ +// RUN: -analyzer-checker=core,debug.ExprInspection + +void clang_analyzer_eval(bool); + +void element_constant() { + char arr[10]; + clang_analyzer_eval(arr + 1 > arr); // expected-warning{{TRUE}} +} + +void element_known() { + char arr[10]; + int off = 1; + clang_analyzer_eval(arr + off > arr); // expected-warning{{TRUE}} +} + +void element_constrained(int off) { + char arr[10]; + if (off == 1) { + clang_analyzer_eval(arr + off > arr); // expected-warning{{TRUE}} + } +} + +void element_unknown(int off) { + char arr[10]; + clang_analyzer_eval(arr + off > arr); // expected-warning{{UNKNOWN}} +} + +void element_complex(int off) { + char arr[10]; + int comp = off * 2; + if (off == 1) { + clang_analyzer_eval(arr + comp); // expected-warning{{TRUE}} + } +} + +void base_constant(int *arr) { + clang_analyzer_eval(arr + 1 > arr); // expected-warning{{TRUE}} +} + +void base_known(int *arr) { + int off = 1; + clang_analyzer_eval(arr + off > arr); // expected-warning{{TRUE}} +} + +void base_constrained(int *arr, int off) { + if (off == 1) { + clang_analyzer_eval(arr + off > arr); // expected-warning{{TRUE}} + } +} + +void base_unknown(int *arr, int off) { + clang_analyzer_eval(arr + off > arr); // expected-warning{{UNKNOWN}} +} + +void base_complex(int *arr, int off) { + int comp = off * 2; + if (off == 1) { + clang_analyzer_eval(arr + comp > arr); // expected-warning{{TRUE}} + } +} diff --git a/clang/test/Analysis/null-pointer-arithm.c b/clang/test/Analysis/null-pointer-arithm.c new file mode 100644 index 0000000000000..228824767937f --- /dev/null +++ b/clang/test/Analysis/null-pointer-arithm.c @@ -0,0 +1,76 @@ +// RUN: %clang_analyze_cc1 -verify %s \ +// RUN: -analyzer-checker=core + +extern int *get_pointer(); + +int *test_add1(int offset) { + int *p = 
get_pointer(); + if (p) {} + return p + offset; // expected-warning{{Addition of a null pointer (from variable 'p') and a probably nonzero integer value (from variable 'offset') may result in undefined behavior}} +} + +int *test_add2(int offset) { + int *p = get_pointer(); + if (p) {} + if (offset) {} + return p + offset; // expected-warning{{Addition of a null pointer (from variable 'p') and a nonzero integer value (from variable 'offset') results in undefined behavior}} +} + +int *test_add3(int offset) { + int *p = get_pointer(); + if (p) {} + if (offset != 0) return 0; + return p + offset; +} + +int *test_add4(int offset) { + int *p = get_pointer(); + if (p) {} + if (offset == 0) return 0; + return p + offset; // expected-warning{{Addition of a null pointer (from variable 'p') and a nonzero integer value (from variable 'offset') results in undefined behavior}} +} + +int *test_add5(int offset) { + int *p = get_pointer(); + if (p) {} + return offset + p; // expected-warning{{Addition of a probably nonzero integer value (from variable 'offset') and a null pointer (from variable 'p') may result in undefined behavior}} +} + +int *test_sub1(int offset) { + int *p = get_pointer(); + if (p) {} + return p - offset; // expected-warning{{Subtraction of a null pointer (from variable 'p') and a probably nonzero integer value (from variable 'offset') may result in undefined behavior}} +} + +int test_sub_p1() { + int *p = get_pointer(); + if (p) {} + return p - p; +} + +int test_sub_p2() { + int *p1 = get_pointer(); + int *p2 = get_pointer(); + if (p1) {} + if (p2) {} + return p1 - p2; + // expected-warning@-1{{Subtraction of a non-null pointer (from variable 'p1') and a null pointer (from variable 'p2') results in undefined behavior}} + // expected-warning@-2{{Subtraction of a null pointer (from variable 'p1') and a non-null pointer (from variable 'p2') results in undefined behavior}} +} + +int test_sub_p3() { + int *p1 = get_pointer(); + int *p2 = get_pointer(); + if (p1) {} + return p1 - p2; // expected-warning{{Subtraction of a null pointer (from variable 'p1') and a probably non-null pointer (from variable 'p2') may result in undefined behavior}} +} + +struct S { + char *p; + int offset; +}; + +char *test_struct(struct S s) { + if (s.p) {} + return s.p + s.offset; // expected-warning{{Addition of a null pointer (via field 'p') and a probably nonzero integer value (via field 'offset') may result in undefined behavior}} +} diff --git a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c index 7fae958f6afc6..9b3296064981f 100644 --- a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c +++ b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c @@ -27,6 +27,7 @@ // CHECK-NEXT: core.NonNullParamChecker // CHECK-NEXT: core.NonnilStringConstants // CHECK-NEXT: core.NullDereference +// CHECK-NEXT: core.NullPointerArithm // CHECK-NEXT: core.StackAddressEscape // CHECK-NEXT: core.UndefinedBinaryOperatorResult // CHECK-NEXT: core.VLASize diff --git a/clang/test/Analysis/string.c b/clang/test/Analysis/string.c index cdd36275568e3..9d2458332b723 100644 --- a/clang/test/Analysis/string.c +++ b/clang/test/Analysis/string.c @@ -82,16 +82,21 @@ size_t strlen(const char *s); void strlen_constant0(void) { clang_analyzer_eval(strlen("123") == 3); // expected-warning{{TRUE}} + clang_analyzer_eval(strlen(&("123"[1])) == 2); // expected-warning{{TRUE}} } void strlen_constant1(void) { const char *a = "123"; 
clang_analyzer_eval(strlen(a) == 3); // expected-warning{{TRUE}} + clang_analyzer_eval(strlen(a + 1) == 2); // expected-warning{{TRUE}} + clang_analyzer_eval(strlen(a + 3) == 0); // expected-warning{{TRUE}} + clang_analyzer_eval(strlen(a + 4)); // expected-warning{{UNKNOWN}} } void strlen_constant2(char x) { char a[] = "123"; clang_analyzer_eval(strlen(a) == 3); // expected-warning{{TRUE}} + clang_analyzer_eval(strlen(a + 1) == 2); // expected-warning{{UNKNOWN}} a[0] = x; clang_analyzer_eval(strlen(a) == 3); // expected-warning{{UNKNOWN}} @@ -105,10 +110,12 @@ char global_non_const_arr[] = "op"; void strlen_global_constant_ptr(void) { clang_analyzer_eval(strlen(global_str_ptr) == 4); // expected-warning{{TRUE}} + clang_analyzer_eval(strlen(global_str_ptr + 1) == 3); // expected-warning{{TRUE}} } void strlen_global_constant_arr(void) { clang_analyzer_eval(strlen(global_str_arr) == 4); // expected-warning{{TRUE}} + clang_analyzer_eval(strlen(global_str_arr + 1) == 3); // expected-warning{{TRUE}} } void strlen_global_non_const_ptr(void) { @@ -235,6 +242,17 @@ void testStrlenCallee(void) { clang_analyzer_eval(lenBefore == lenAfter); // expected-warning{{UNKNOWN}} } +void strlen_symbolic_offset(unsigned x) { + const char *str = "abcd"; + if (x < 1 || x > 3) + return; + // FIXME: these should be TRUE + clang_analyzer_eval(strlen(str + x) >= 1); // expected-warning{{UNKNOWN}} + clang_analyzer_eval(strlen(str + x) <= 3); // expected-warning{{UNKNOWN}} + if (x != 1) + return; + clang_analyzer_eval(strlen(str + x) == 3); // expected-warning{{TRUE}} +} //===----------------------------------------------------------------------=== // strnlen() @@ -244,32 +262,38 @@ size_t strnlen(const char *s, size_t maxlen); void strnlen_constant0(void) { clang_analyzer_eval(strnlen("123", 10) == 3); // expected-warning{{TRUE}} + clang_analyzer_eval(strnlen(&("123"[1]), 10) == 2); // expected-warning{{TRUE}} } void strnlen_constant1(void) { const char *a = "123"; clang_analyzer_eval(strnlen(a, 10) == 3); // expected-warning{{TRUE}} + clang_analyzer_eval(strnlen(a + 1, 10) == 2); // expected-warning{{TRUE}} } void strnlen_constant2(char x) { char a[] = "123"; clang_analyzer_eval(strnlen(a, 10) == 3); // expected-warning{{TRUE}} + clang_analyzer_eval(strnlen(a + 1, 10) == 2); // expected-warning{{UNKNOWN}} a[0] = x; clang_analyzer_eval(strnlen(a, 10) == 3); // expected-warning{{UNKNOWN}} } void strnlen_constant4(void) { clang_analyzer_eval(strnlen("123456", 3) == 3); // expected-warning{{TRUE}} + clang_analyzer_eval(strnlen(&("123456"[1]), 3) == 3); // expected-warning{{TRUE}} } void strnlen_constant5(void) { const char *a = "123456"; clang_analyzer_eval(strnlen(a, 3) == 3); // expected-warning{{TRUE}} + clang_analyzer_eval(strnlen(a + 1, 3) == 3); // expected-warning{{TRUE}} } void strnlen_constant6(char x) { char a[] = "123456"; clang_analyzer_eval(strnlen(a, 3) == 3); // expected-warning{{TRUE}} + clang_analyzer_eval(strnlen(a + 1, 3) == 3); // expected-warning{{UNKNOWN}} a[0] = x; clang_analyzer_eval(strnlen(a, 3) == 3); // expected-warning{{UNKNOWN}} } @@ -326,6 +350,19 @@ void strnlen_at_actual(size_t limit) { } } +void strnlen_at_actual_1(size_t limit) { + const char *str = "abc"; + size_t len = strnlen(str + 1, limit); + clang_analyzer_eval(len <= 2); // expected-warning{{TRUE}} + // This is due to eager assertion in strnlen. 
+ if (limit == 0) { + clang_analyzer_eval(len == 0); // expected-warning{{TRUE}} + } else { + clang_analyzer_eval(len == 2); // expected-warning{{UNKNOWN}} + clang_analyzer_eval(len < 2); // expected-warning{{UNKNOWN}} + } +} + //===----------------------------------------------------------------------=== // strcpy() //===----------------------------------------------------------------------=== diff --git a/clang/test/Analysis/z3-crosscheck-max-attempts.cpp b/clang/test/Analysis/z3-crosscheck-max-attempts.cpp index 8439236883f10..ab66fe94582b3 100644 --- a/clang/test/Analysis/z3-crosscheck-max-attempts.cpp +++ b/clang/test/Analysis/z3-crosscheck-max-attempts.cpp @@ -3,10 +3,10 @@ // RUN: | FileCheck %s --match-full-lines // CHECK: crosscheck-with-z3-max-attempts-per-query = 3 -// DEFINE: %{mocked_clang} = \ -// DEFINE: LD_PRELOAD="%llvmshlibdir/MockZ3SolverCheck%pluginext" \ -// DEFINE: %clang_analyze_cc1 %s \ -// DEFINE: -analyzer-config crosscheck-with-z3=true \ +// DEFINE: %{mocked_clang} = \ +// DEFINE: env LD_PRELOAD="%llvmshlibdir/MockZ3SolverCheck%pluginext" \ +// DEFINE: %clang_analyze_cc1 %s \ +// DEFINE: -analyzer-config crosscheck-with-z3=true \ // DEFINE: -analyzer-checker=core // DEFINE: %{attempts} = -analyzer-config crosscheck-with-z3-max-attempts-per-query @@ -14,17 +14,17 @@ // RUN: not %clang_analyze_cc1 %{attempts}=0 2>&1 | FileCheck %s --check-prefix=VERIFY-INVALID // VERIFY-INVALID: invalid input for analyzer-config option 'crosscheck-with-z3-max-attempts-per-query', that expects a positive value -// RUN: Z3_SOLVER_RESULTS="UNDEF" %{mocked_clang} %{attempts}=1 -verify=refuted -// RUN: Z3_SOLVER_RESULTS="UNSAT" %{mocked_clang} %{attempts}=1 -verify=refuted -// RUN: Z3_SOLVER_RESULTS="SAT" %{mocked_clang} %{attempts}=1 -verify=accepted +// RUN: env Z3_SOLVER_RESULTS="UNDEF" %{mocked_clang} %{attempts}=1 -verify=refuted +// RUN: env Z3_SOLVER_RESULTS="UNSAT" %{mocked_clang} %{attempts}=1 -verify=refuted +// RUN: env Z3_SOLVER_RESULTS="SAT" %{mocked_clang} %{attempts}=1 -verify=accepted -// RUN: Z3_SOLVER_RESULTS="UNDEF,UNDEF" %{mocked_clang} %{attempts}=2 -verify=refuted -// RUN: Z3_SOLVER_RESULTS="UNDEF,UNSAT" %{mocked_clang} %{attempts}=2 -verify=refuted -// RUN: Z3_SOLVER_RESULTS="UNDEF,SAT" %{mocked_clang} %{attempts}=2 -verify=accepted +// RUN: env Z3_SOLVER_RESULTS="UNDEF,UNDEF" %{mocked_clang} %{attempts}=2 -verify=refuted +// RUN: env Z3_SOLVER_RESULTS="UNDEF,UNSAT" %{mocked_clang} %{attempts}=2 -verify=refuted +// RUN: env Z3_SOLVER_RESULTS="UNDEF,SAT" %{mocked_clang} %{attempts}=2 -verify=accepted -// RUN: Z3_SOLVER_RESULTS="UNDEF,UNDEF,UNDEF" %{mocked_clang} %{attempts}=3 -verify=refuted -// RUN: Z3_SOLVER_RESULTS="UNDEF,UNDEF,UNSAT" %{mocked_clang} %{attempts}=3 -verify=refuted -// RUN: Z3_SOLVER_RESULTS="UNDEF,UNDEF,SAT" %{mocked_clang} %{attempts}=3 -verify=accepted +// RUN: env Z3_SOLVER_RESULTS="UNDEF,UNDEF,UNDEF" %{mocked_clang} %{attempts}=3 -verify=refuted +// RUN: env Z3_SOLVER_RESULTS="UNDEF,UNDEF,UNSAT" %{mocked_clang} %{attempts}=3 -verify=refuted +// RUN: env Z3_SOLVER_RESULTS="UNDEF,UNDEF,SAT" %{mocked_clang} %{attempts}=3 -verify=accepted // REQUIRES: z3, z3-mock, asserts, shell, system-linux diff --git a/clang/test/Analysis/z3/D83660.c b/clang/test/Analysis/z3/D83660.c index 16ea4ffb0e1be..a81ce3a63ff16 100644 --- a/clang/test/Analysis/z3/D83660.c +++ b/clang/test/Analysis/z3/D83660.c @@ -1,4 +1,4 @@ -// RUN: Z3_SOLVER_RESULTS="SAT,SAT,SAT,SAT,UNDEF" \ +// RUN: env Z3_SOLVER_RESULTS="SAT,SAT,SAT,SAT,UNDEF" \ // RUN: 
LD_PRELOAD="%llvmshlibdir/MockZ3SolverCheck%pluginext" \ // RUN: %clang_analyze_cc1 -analyzer-constraints=z3 \ // RUN: -analyzer-checker=core %s -verify diff --git a/clang/test/Analysis/zero-size-non-pod-array.cpp b/clang/test/Analysis/zero-size-non-pod-array.cpp index 628be0d7896e0..8a32a4989613e 100644 --- a/clang/test/Analysis/zero-size-non-pod-array.cpp +++ b/clang/test/Analysis/zero-size-non-pod-array.cpp @@ -95,7 +95,7 @@ void zeroSizeArrayBinding() { // Note: This is an error in gcc but a warning in clang. // In MSVC the declaration of 'S arr[0]' is already an error // and it doesn't recognize this syntax as a structured binding. - auto [] = arr; //expected-warning{{ISO C++17 does not allow a decomposition group to be empty}} + auto [] = arr; //expected-warning{{ISO C++17 does not allow a structured binding group to be empty}} clang_analyzer_eval(S::CtorInvocationCount == 0); //expected-warning{{TRUE}} } diff --git a/clang/test/C/C11/n1285_1.c b/clang/test/C/C11/n1285_1.c index 345ec94a1eeef..a1422f2b6ff63 100644 --- a/clang/test/C/C11/n1285_1.c +++ b/clang/test/C/C11/n1285_1.c @@ -32,9 +32,9 @@ struct X f(void); // C11-O2-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[REF_TMP]], i32 0, i32 0 // C11-O2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], ptr [[A]], i64 0, i64 0 // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR5]] -// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr [[P]], align 8, !tbaa [[INTPTR_TBAA2:![0-9]+]] -// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[INTPTR_TBAA2]] -// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA7:![0-9]+]] +// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr [[P]], align 8, !tbaa [[INTPTR_TBAA6:![0-9]+]] +// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[INTPTR_TBAA6]] +// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA2:![0-9]+]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[P]]) #[[ATTR5]] // C11-O2-NEXT: ret i32 [[TMP1]] // @@ -91,18 +91,18 @@ int func_return(void) { // C11-O2: [[COND_END]]: // C11-O2-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[REF_TMP]], i32 0, i32 0 // C11-O2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], ptr [[A1]], i64 0, i64 0 -// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] +// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[INTPTR_TBAA6]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR5]] // C11-O2-NEXT: call void @llvm.lifetime.start.p0(ptr [[Q]]) #[[ATTR5]] // C11-O2-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[DOTCOMPOUNDLITERAL]], i8 0, i64 20, i1 false) // C11-O2-NEXT: [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[DOTCOMPOUNDLITERAL]], i32 0, i32 0 // C11-O2-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[DOTCOMPOUNDLITERAL]], i32 0, i32 0 // C11-O2-NEXT: [[ARRAYDECAY4:%.*]] = getelementptr inbounds [5 x i32], ptr [[A3]], i64 0, i64 0 -// C11-O2-NEXT: store ptr [[ARRAYDECAY4]], ptr [[Q]], align 8, !tbaa [[INTPTR_TBAA2]] -// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] -// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA7]] -// C11-O2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Q]], align 8, !tbaa [[INTPTR_TBAA2]] -// C11-O2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[INT_TBAA7]] +// C11-O2-NEXT: store ptr [[ARRAYDECAY4]], ptr [[Q]], align 8, 
!tbaa [[INTPTR_TBAA6]] +// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[INTPTR_TBAA6]] +// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA2]] +// C11-O2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Q]], align 8, !tbaa [[INTPTR_TBAA6]] +// C11-O2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[INT_TBAA2]] // C11-O2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP3]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[Q]]) #[[ATTR5]] // C11-O2-NEXT: ret i32 [[ADD]] @@ -138,10 +138,10 @@ int ternary(void) { // C11-O2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[REF_TMP]], ptr align 4 [[X]], i64 20, i1 false), !tbaa.struct [[TBAA_STRUCT9:![0-9]+]] // C11-O2-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[REF_TMP]], i32 0, i32 0 // C11-O2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], ptr [[A]], i64 0, i64 0 -// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] +// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[INTPTR_TBAA6]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR5]] -// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] -// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA7]] +// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[INTPTR_TBAA6]] +// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA2]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[X]]) #[[ATTR5]] // C11-O2-NEXT: ret i32 [[TMP1]] // @@ -175,10 +175,10 @@ int comma(void) { // C11-O2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[REF_TMP]], ptr align 4 [[X]], i64 20, i1 false), !tbaa.struct [[TBAA_STRUCT9]] // C11-O2-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[REF_TMP]], i32 0, i32 0 // C11-O2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], ptr [[A]], i64 0, i64 0 -// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] +// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[INTPTR_TBAA6]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR5]] -// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] -// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA7]] +// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[INTPTR_TBAA6]] +// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA2]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[X]]) #[[ATTR5]] // C11-O2-NEXT: ret i32 [[TMP1]] // @@ -217,10 +217,10 @@ int cast(void) { // C11-O2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[REF_TMP]], ptr align 4 [[X]], i64 20, i1 false), !tbaa.struct [[TBAA_STRUCT9]] // C11-O2-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_X]], ptr [[REF_TMP]], i32 0, i32 0 // C11-O2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [5 x i32], ptr [[A]], i64 0, i64 0 -// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] +// C11-O2-NEXT: store ptr [[ARRAYDECAY]], ptr @p, align 8, !tbaa [[INTPTR_TBAA6]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR5]] -// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[INTPTR_TBAA2]] -// C11-O2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA7]] +// C11-O2-NEXT: [[TMP0:%.*]] = load ptr, ptr @p, align 8, !tbaa [[INTPTR_TBAA6]] +// C11-O2-NEXT: [[TMP1:%.*]] = load 
i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA2]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[S]]) #[[ATTR5]] // C11-O2-NEXT: call void @llvm.lifetime.end.p0(ptr [[X]]) #[[ATTR5]] // C11-O2-NEXT: ret i32 [[TMP1]] @@ -232,13 +232,13 @@ int assign(void) { return *p; } //. -// C11-O2: [[INTPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// C11-O2: [[META3]] = !{!"p1 int", [[META4:![0-9]+]], i64 0} -// C11-O2: [[META4]] = !{!"any pointer", [[META5:![0-9]+]], i64 0} -// C11-O2: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// C11-O2: [[META6]] = !{!"Simple C/C++ TBAA"} -// C11-O2: [[INT_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} -// C11-O2: [[META8]] = !{!"int", [[META5]], i64 0} +// C11-O2: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// C11-O2: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// C11-O2: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// C11-O2: [[META5]] = !{!"Simple C/C++ TBAA"} +// C11-O2: [[INTPTR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// C11-O2: [[META7]] = !{!"p1 int", [[META8:![0-9]+]], i64 0} +// C11-O2: [[META8]] = !{!"any pointer", [[META4]], i64 0} // C11-O2: [[TBAA_STRUCT9]] = !{i64 0, i64 20, [[META10:![0-9]+]]} -// C11-O2: [[META10]] = !{[[META5]], [[META5]], i64 0} +// C11-O2: [[META10]] = !{[[META4]], [[META4]], i64 0} //. diff --git a/clang/test/C/C23/n3037.c b/clang/test/C/C23/n3037.c index 3748375692430..113ecf74d8bef 100644 --- a/clang/test/C/C23/n3037.c +++ b/clang/test/C/C23/n3037.c @@ -30,11 +30,24 @@ void func2(PRODUCT(int, SUM(float, double)) y) { // c17-warning {{declaration of struct foop { struct { int x; }; }; // c17-note {{previous definition is here}} struct foop { struct { int x; }; }; // c17-error {{redefinition of 'foop'}} +// Test that field lookup compatibility alone isn't sufficient; the structure of the types must be compatible as well.
+struct AnonymousStructNotMatchingFields { // c17-note {{previous definition is here}} + struct { // c23-note {{field has name '' here}} + int x; + }; +}; +struct AnonymousStructNotMatchingFields { // c23-error {{type 'struct AnonymousStructNotMatchingFields' has incompatible definitions}} \ + c17-error {{redefinition of 'AnonymousStructNotMatchingFields'}} + int x; // c23-note {{field has name 'x' here}} +}; + union barp { int x; float y; }; // c17-note {{previous definition is here}} union barp { int x; float y; }; // c17-error {{redefinition of 'barp'}} typedef struct q { int x; } q_t; // c17-note 2 {{previous definition is here}} typedef struct q { int x; } q_t; // c17-error {{redefinition of 'q'}} \ c17-error-re {{typedef redefinition with different types ('struct (unnamed struct at {{.*}})' vs 'struct q')}} +typedef struct { int x; } untagged_q_t; // both-note {{previous definition is here}} +typedef struct { int x; } untagged_q_t; // both-error {{typedef redefinition with different types}} void func3(void) { struct S { int x; }; // c17-note {{previous definition is here}} struct T { struct S s; }; // c17-note {{previous definition is here}} @@ -389,13 +402,40 @@ void nontag_both_in_params(struct { int i; } Arg1, struct { int i; } Arg2) { _Static_assert(0 == _Generic(__typeof__(Arg1), __typeof__(Arg2) : 1, default : 0)); // both-warning {{passing a type argument as the first operand to '_Generic' is a C2y extension}} } -struct InnerAnonStruct { +struct InnerUnnamedStruct { struct { int i; } untagged; -} inner_anon_tagged; +} inner_unnamed_tagged; +_Static_assert(0 == _Generic(inner_unnamed_tagged.untagged, struct { int i; } : 1, default : 0)); -_Static_assert(0 == _Generic(inner_anon_tagged.untagged, struct { int i; } : 1, default : 0)); +struct InnerUnnamedStruct_same { + struct { + int i; + } untagged; +}; +struct InnerUnnamedStruct_differentNaming { + struct { + int i; + } untaggedDifferent; +}; +struct InnerUnnamedStruct_differentShape { + float x; + struct { + int i; + } untagged; + int y; +}; +void compare_unnamed_struct_from_different_outer_type( + struct InnerUnnamedStruct sameOuterType, + struct InnerUnnamedStruct_same matchingType, + struct InnerUnnamedStruct_differentNaming differentFieldName, + struct InnerUnnamedStruct_differentShape differentType) { + inner_unnamed_tagged.untagged = sameOuterType.untagged; + inner_unnamed_tagged.untagged = matchingType.untagged; // both-error-re {{assigning to 'struct (unnamed struct at {{.*}})' from incompatible type 'struct (unnamed struct at {{.*}})'}} + inner_unnamed_tagged.untagged = differentFieldName.untaggedDifferent; // both-error-re {{assigning to 'struct (unnamed struct at {{.*}})' from incompatible type 'struct (unnamed struct at {{.*}})'}} + inner_unnamed_tagged.untagged = differentType.untagged; // both-error-re {{assigning to 'struct (unnamed struct at {{.*}})' from incompatible type 'struct (unnamed struct at {{.*}})'}} +} // Test the same thing with enumerations (test for unions is omitted because // unions and structures are both RecordDecl objects, whereas EnumDecl is not). 
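The n3037.c hunks above all probe one C23 rule (WG14 N3037): a struct or union tag may be redefined in the same scope when the two definitions are compatible member for member, which is why several redefinitions in this test diagnose only under C17. A minimal sketch of the rule the new tests pin down, in C; the tag and member names are illustrative:

/* Identical redefinition of a tag in the same scope: valid in C23,
   a redefinition error in C17 and earlier. */
struct pair { int x; int y; };
struct pair { int x; int y; };

/* Incompatible redefinition: wrapping x in an anonymous struct changes the
   structure of the type, so C23 rejects it even though member lookup would
   find 'x' in both definitions (kept commented out so this sketch compiles). */
/* struct wrapped { struct { int x; }; }; */
/* struct wrapped { int x; };             */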
diff --git a/clang/test/C/C2y/n3364.c b/clang/test/C/C2y/n3364.c index d75f17d0a7a84..ccf7e8d491346 100644 --- a/clang/test/C/C2y/n3364.c +++ b/clang/test/C/C2y/n3364.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -verify -std=c2y -ffreestanding -Wall -pedantic -emit-llvm -o - %s -// RUN: %clang_cc1 -verify -ffreestanding -Wall -pedantic -emit-llvm -o - %s +// RUN: %clang_cc1 -verify -std=c2y -ffreestanding -Wall -pedantic -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -verify -ffreestanding -Wall -pedantic -emit-llvm -o - %s | FileCheck %s // expected-no-diagnostics /* WG14 N3364: Yes @@ -23,20 +23,20 @@ float f1 = FLT_SNAN; float f2 = +FLT_SNAN; float f3 = -FLT_SNAN; -// CHECK: @f1 = {{.*}}global float 0x7FF0000020000000 -// CHECK: @f2 = {{.*}}global float 0x7FF0000020000000 -// CHECK: @f3 = {{.*}}global float 0xFFF0000020000000 +// CHECK: @f1 = {{.*}}global float 0x7FF4000000000000 +// CHECK: @f2 = {{.*}}global float 0x7FF4000000000000 +// CHECK: @f3 = {{.*}}global float 0xFFF4000000000000 double d1 = DBL_SNAN; double d2 = +DBL_SNAN; double d3 = -DBL_SNAN; -// CHECK: @d1 = {{.*}}global double 0x7FF0000000000001 -// CHECK: @d2 = {{.*}}global double 0x7FF0000000000001 -// CHECK: @d3 = {{.*}}global double 0xFFF0000000000001 +// CHECK: @d1 = {{.*}}global double 0x7FF4000000000000 +// CHECK: @d2 = {{.*}}global double 0x7FF4000000000000 +// CHECK: @d3 = {{.*}}global double 0xFFF4000000000000 long double ld1 = LDBL_SNAN; long double ld2 = +LDBL_SNAN; long double ld3 = -LDBL_SNAN; -// CHECK: @ld1 = {{.*}}global {{double 0x7FF0000000000001|x86_fp80 0xK7FFF8000000000000001|fp128 0xL00000000000000017FFF000000000000}} -// CHECK: @ld2 = {{.*}}global {{double 0x7FF0000000000001|x86_fp80 0xK7FFF8000000000000001|fp128 0xL00000000000000017FFF000000000000}} -// CHECK: @ld3 = {{.*}}global {{double 0xFFF0000000000001|x86_fp80 0xKFFFF8000000000000001|fp128 0xL0000000000000001FFFF000000000000}} +// CHECK: @ld1 = {{.*}}global {{double 0x7FF4000000000000|x86_fp80 0xK7FFFA000000000000000|fp128 0xL00000000000000007FFF400000000000|ppc_fp128 0xM7FF40000000000000000000000000000}} +// CHECK: @ld2 = {{.*}}global {{double 0x7FF4000000000000|x86_fp80 0xK7FFFA000000000000000|fp128 0xL00000000000000007FFF400000000000|ppc_fp128 0xM7FF40000000000000000000000000000}} +// CHECK: @ld3 = {{.*}}global {{double 0xFFF4000000000000|x86_fp80 0xKFFFFA000000000000000|fp128 0xL0000000000000000FFFF400000000000|ppc_fp128 0xMFFF40000000000008000000000000000}} diff --git a/clang/test/CIR/CodeGen/agg-expr-lvalue.c b/clang/test/CIR/CodeGen/agg-expr-lvalue.c new file mode 100644 index 0000000000000..c826f8fa829d0 --- /dev/null +++ b/clang/test/CIR/CodeGen/agg-expr-lvalue.c @@ -0,0 +1,111 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s + +struct Point { + int x, y; +}; + +struct Line { + struct Point start; + struct Point end; +}; + +// AggExprEmitter::VisitMemberExpr +void test_member_in_array(void) { + struct Line line = {{1, 2}, {3, 4}}; + struct Point arr[1] = {line.start}; +} + +// CIR-LABEL: cir.func{{.*}} @test_member_in_array +// CIR: %[[LINE:.*]] = cir.alloca !rec_Line{{.*}}, ["line", init] +// CIR: %[[ARR:.*]] = cir.alloca !cir.array{{.*}}, ["arr", 
init] +// CIR: %[[MEMBER:.*]] = cir.get_member %[[LINE]][0] {name = "start"} +// CIR: cir.copy + +// LLVM-LABEL: define{{.*}} @test_member_in_array +// LLVM: %[[LINE:.*]] = alloca %struct.Line +// LLVM: %[[ARR:.*]] = alloca [1 x %struct.Point] +// LLVM: %[[MEMBER:.*]] = getelementptr{{.*}}%struct.Line{{.*}}%[[LINE]]{{.*}}i32 0, i32 0 +// LLVM: call void @llvm.memcpy + +// OGCG-LABEL: define{{.*}} @test_member_in_array +// OGCG: %[[LINE:.*]] = alloca %struct.Line +// OGCG: %[[ARR:.*]] = alloca [1 x %struct.Point] +// OGCG: %[[MEMBER:.*]] = getelementptr{{.*}}%struct.Line{{.*}}%[[LINE]]{{.*}}i32 0, i32 0 +// OGCG: call void @llvm.memcpy + +// AggExprEmitter::VisitMemberExpr +void test_member_arrow_in_array(void) { + struct Line *line_ptr; + struct Point arr[1] = {line_ptr->start}; +} + +// CIR-LABEL: cir.func{{.*}} @test_member_arrow_in_array +// CIR: %[[PTR:.*]] = cir.alloca !cir.ptr{{.*}}, ["line_ptr"] +// CIR: %[[ARR:.*]] = cir.alloca !cir.array{{.*}}, ["arr", init] +// CIR: %[[LOADED:.*]] = cir.load{{.*}}%[[PTR]] +// CIR: %[[MEMBER:.*]] = cir.get_member %[[LOADED]][0] {name = "start"} +// CIR: cir.copy + +// LLVM-LABEL: define{{.*}} @test_member_arrow_in_array +// LLVM: %[[PTR:.*]] = alloca ptr +// LLVM: %[[ARR:.*]] = alloca [1 x %struct.Point] +// LLVM: %[[LOADED:.*]] = load ptr{{.*}}%[[PTR]] +// LLVM: %[[MEMBER:.*]] = getelementptr{{.*}}%struct.Line{{.*}}%[[LOADED]]{{.*}}i32 0, i32 0 +// LLVM: call void @llvm.memcpy + +// OGCG-LABEL: define{{.*}} @test_member_arrow_in_array +// OGCG: %[[PTR:.*]] = alloca ptr +// OGCG: %[[ARR:.*]] = alloca [1 x %struct.Point] +// OGCG: %[[LOADED:.*]] = load ptr{{.*}}%[[PTR]] +// OGCG: %[[MEMBER:.*]] = getelementptr{{.*}}%struct.Line{{.*}}%[[LOADED]]{{.*}}i32 0, i32 0 +// OGCG: call void @llvm.memcpy + +// AggExprEmitter::VisitUnaryDeref +void test_deref_in_array(void) { + struct Point *ptr; + struct Point arr[1] = {*ptr}; +} + +// CIR-LABEL: cir.func{{.*}} @test_deref_in_array +// CIR: %[[PTR:.*]] = cir.alloca !cir.ptr{{.*}}, ["ptr"] +// CIR: %[[ARR:.*]] = cir.alloca !cir.array{{.*}}, ["arr", init] +// CIR: %[[LOADED:.*]] = cir.load{{.*}}%[[PTR]] +// CIR: cir.copy + +// LLVM-LABEL: define{{.*}} @test_deref_in_array +// LLVM: %[[PTR:.*]] = alloca ptr +// LLVM: %[[ARR:.*]] = alloca [1 x %struct.Point] +// LLVM: %[[LOADED:.*]] = load ptr{{.*}}%[[PTR]] +// LLVM: call void @llvm.memcpy + +// OGCG-LABEL: define{{.*}} @test_deref_in_array +// OGCG: %[[PTR:.*]] = alloca ptr +// OGCG: %[[ARR:.*]] = alloca [1 x %struct.Point] +// OGCG: %[[LOADED:.*]] = load ptr{{.*}}%[[PTR]] +// OGCG: call void @llvm.memcpy + +// AggExprEmitter::VisitStringLiteral +void test_string_array_in_array(void) { + char matrix[2][6] = {"hello", "world"}; +} + +// CIR-LABEL: cir.func{{.*}} @test_string_array_in_array +// CIR: cir.alloca !cir.array x 2>, {{.*}}, ["matrix", init] +// CIR: cir.get_global +// CIR: cir.copy +// CIR: cir.get_global +// CIR: cir.copy + +// LLVM-LABEL: define{{.*}} @test_string_array_in_array +// LLVM: alloca [2 x [6 x i8]] +// LLVM: call void @llvm.memcpy +// LLVM: call void @llvm.memcpy + +// OGCG-LABEL: define{{.*}} @test_string_array_in_array +// OGCG: alloca [2 x [6 x i8]] +// OGCG: call void @llvm.memcpy{{.*}}@__const.test_string_array_in_array diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp index d7488bfb258f8..82add4b347e72 100644 --- a/clang/test/CIR/CodeGen/array.cpp +++ b/clang/test/CIR/CodeGen/array.cpp @@ -123,7 +123,7 @@ void func() { // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[ELE_PTR]] : !cir.ptr, !s32i // CIR" 
cir.store %[[TMP]], %[[INIT_2]] : !s32i, !cir.ptr -// LLVM: define{{.*}} void @_Z4funcv() +// LLVM: define{{.*}} void @_Z4funcv(){{.*}} // LLVM-NEXT: %[[ARR:.*]] = alloca [10 x i32], i64 1, align 16 // LLVM-NEXT: %[[INIT:.*]] = alloca i32, i64 1, align 4 // LLVM-NEXT: %[[INIT_2:.*]] = alloca i32, i64 1, align 4 @@ -174,7 +174,7 @@ void func2() { // CIR: cir.condition(%[[CMP]]) // CIR: } -// LLVM: define{{.*}} void @_Z5func2v() +// LLVM: define{{.*}} void @_Z5func2v(){{.*}} // LLVM: %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4 // LLVM: %[[TMP:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR]], i32 0 @@ -224,7 +224,7 @@ void func3() { // CIR: %[[ELE_TMP:.*]] = cir.load{{.*}} %[[ELE_PTR]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[ELE_TMP]], %[[INIT]] : !s32i, !cir.ptr -// LLVM: define{{.*}} void @_Z5func3v() +// LLVM: define{{.*}} void @_Z5func3v(){{.*}} // LLVM: %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4 // LLVM: %[[IDX:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 @@ -276,7 +276,7 @@ void func4() { // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[ELE_0]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[TMP]], %[[INIT]] : !s32i, !cir.ptr -// LLVM: define{{.*}} void @_Z5func4v() +// LLVM: define{{.*}} void @_Z5func4v(){{.*}} // LLVM: %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4 // LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0 @@ -329,7 +329,7 @@ void func5() { // CIR: cir.condition(%[[CMP]]) // CIR: } -// LLVM: define{{.*}} void @_Z5func5v() +// LLVM: define{{.*}} void @_Z5func5v(){{.*}} // LLVM: %[[ARR:.*]] = alloca [2 x [1 x i32]], i64 1, align 4 // LLVM: %[[TMP:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR]], i32 0 @@ -372,7 +372,7 @@ void func6() { // CIR: %[[V1:.*]] = cir.const #cir.int<5> : !s32i // CIR: cir.store{{.*}} %[[V1]], %[[ELE_PTR]] : !s32i, !cir.ptr -// LLVM: define{{.*}} void @_Z5func6v() +// LLVM: define{{.*}} void @_Z5func6v(){{.*}} // LLVM: %[[VAR:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4 // LLVM: store i32 4, ptr %[[VAR]], align 4 @@ -414,7 +414,7 @@ void func7() { // CIR: cir.condition(%[[CMP]]) // CIR: } -// LLVM: define{{.*}} void @_Z5func7v() +// LLVM: define{{.*}} void @_Z5func7v(){{.*}} // LLVM: %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8 // LLVM: %[[TMP:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[ARR_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0 @@ -458,7 +458,7 @@ void func8(int arr[10]) { // CIR: %[[TMP_4:.*]] = cir.load{{.*}} %[[ELE_1]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[TMP_4]], %[[INIT_2]] : !s32i, !cir.ptr -// LLVM: define{{.*}} void @_Z5func8Pi(ptr %[[ARG:.*]]) +// LLVM: define{{.*}} void @_Z5func8Pi(ptr %[[ARG:.*]]){{.*}} // LLVM: %[[ARR:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 // LLVM: %[[INIT_2:.*]] = alloca i32, i64 1, align 4 @@ -502,7 +502,7 @@ void func9(int arr[10][5]) { // CIR: %[[TMP_2:.*]] = cir.load{{.*}} %[[ARR_1_2]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[TMP_2]], %[[INIT]] : !s32i, !cir.ptr -// LLVM: define{{.*}} void @_Z5func9PA5_i(ptr %[[ARG:.*]]) +// LLVM: define{{.*}} void @_Z5func9PA5_i(ptr %[[ARG:.*]]){{.*}} // LLVM: %[[ARR:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 // LLVM: store ptr %[[ARG]], ptr %[[ARR]], align 8 @@ -536,7 +536,7 @@ void func10(int *a) { // CIR: 
%[[TMP_2:.*]] = cir.load{{.*}} %[[ELE]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[TMP_2]], %[[INIT]] : !s32i, !cir.ptr -// LLVM: define{{.*}} void @_Z6func10Pi(ptr %[[ARG:.*]]) { +// LLVM: define{{.*}} void @_Z6func10Pi(ptr %[[ARG:.*]]){{.*}} { // LLVM: %[[ARR:.*]] = alloca ptr, i64 1, align 8 // LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 // LLVM: store ptr %[[ARG]], ptr %[[ARR]], align 8 diff --git a/clang/test/CIR/CodeGen/assign-operator.cpp b/clang/test/CIR/CodeGen/assign-operator.cpp index 1089d4b6e69f8..66d4b4818c10e 100644 --- a/clang/test/CIR/CodeGen/assign-operator.cpp +++ b/clang/test/CIR/CodeGen/assign-operator.cpp @@ -20,7 +20,7 @@ void a() { // CIR: %[[ONE_CAST:.*]] = cir.cast integral %[[ONE]] : !u32i -> !s32i // CIR: %[[RET:.*]] = cir.call @_ZN1xaSEi(%[[A_ADDR]], %[[ONE_CAST]]) : (!cir.ptr, !s32i) -> !s32i -// LLVM: define{{.*}} @_Z1av() +// LLVM: define{{.*}} @_Z1av(){{.*}} // OGCG: define{{.*}} @_Z1av() void f(int i, int j) { @@ -121,7 +121,7 @@ void copy_ref_to_ref(E &e1, E &e2) { // CIR: %[[D1_REF_2:.*]] = cir.call @_ZN1DaSERKS_(%[[D1_REF]], %[[D2_REF]]) // CIR: cir.return -// LLVM: define{{.*}} void @_Z15copy_ref_to_refR1ES0_(ptr %[[ARG0:.*]], ptr %[[ARG1:.*]]) { +// LLVM: define{{.*}} void @_Z15copy_ref_to_refR1ES0_(ptr %[[ARG0:.*]], ptr %[[ARG1:.*]]){{.*}} { // LLVM: %[[E1_ADDR:.*]] = alloca ptr // LLVM: %[[E2_ADDR:.*]] = alloca ptr // LLVM: store ptr %[[ARG0]], ptr %[[E1_ADDR]] diff --git a/clang/test/CIR/CodeGen/atomic.c b/clang/test/CIR/CodeGen/atomic.c index 76289c597a2b5..65799881a0cbe 100644 --- a/clang/test/CIR/CodeGen/atomic.c +++ b/clang/test/CIR/CodeGen/atomic.c @@ -211,7 +211,7 @@ void c11_atomic_cmpxchg_strong(_Atomic(int) *ptr, int *expected, int desired) { __c11_atomic_compare_exchange_strong(ptr, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); - // CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg(%{{.+}} : !cir.ptr, %{{.+}} : !s32i, %{{.+}} : !s32i, success = seq_cst, failure = acquire) align(4) : (!s32i, !cir.bool) + // CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg success(seq_cst) failure(acquire) %{{.+}}, %{{.+}}, %{{.+}} align(4) : (!cir.ptr, !s32i, !s32i) -> (!s32i, !cir.bool) // CIR-NEXT: %[[FAILED:.+]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool // CIR-NEXT: cir.if %[[FAILED]] { // CIR-NEXT: cir.store align(4) %[[OLD]], %{{.+}} : !s32i, !cir.ptr @@ -249,7 +249,7 @@ void c11_atomic_cmpxchg_weak(_Atomic(int) *ptr, int *expected, int desired) { __c11_atomic_compare_exchange_weak(ptr, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); - // CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg(%{{.+}} : !cir.ptr, %{{.+}} : !s32i, %{{.+}} : !s32i, success = seq_cst, failure = acquire) align(4) weak : (!s32i, !cir.bool) + // CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg weak success(seq_cst) failure(acquire) %{{.+}}, %{{.+}}, %{{.+}} align(4) : (!cir.ptr, !s32i, !s32i) -> (!s32i, !cir.bool) // CIR-NEXT: %[[FAILED:.+]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool // CIR-NEXT: cir.if %[[FAILED]] { // CIR-NEXT: cir.store align(4) %[[OLD]], %{{.+}} : !s32i, !cir.ptr @@ -286,7 +286,7 @@ void atomic_cmpxchg(int *ptr, int *expected, int *desired) { // OGCG-LABEL: @atomic_cmpxchg __atomic_compare_exchange(ptr, expected, desired, /*weak=*/0, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); - // CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg(%{{.+}} : !cir.ptr, %{{.+}} : !s32i, %{{.+}} : !s32i, success = seq_cst, failure = acquire) align(4) : (!s32i, !cir.bool) + // CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = 
cir.atomic.cmpxchg success(seq_cst) failure(acquire) %{{.+}}, %{{.+}}, %{{.+}} align(4) : (!cir.ptr, !s32i, !s32i) -> (!s32i, !cir.bool) // CIR-NEXT: %[[FAILED:.+]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool // CIR-NEXT: cir.if %[[FAILED]] { // CIR-NEXT: cir.store align(4) %[[OLD]], %{{.+}} : !s32i, !cir.ptr @@ -317,7 +317,7 @@ void atomic_cmpxchg(int *ptr, int *expected, int *desired) { // OGCG-NEXT: store i8 %[[SUCCESS_2]], ptr %{{.+}}, align 1 __atomic_compare_exchange(ptr, expected, desired, /*weak=*/1, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); - // CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg(%{{.+}} : !cir.ptr, %{{.+}} : !s32i, %{{.+}} : !s32i, success = seq_cst, failure = acquire) align(4) weak : (!s32i, !cir.bool) + // CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg weak success(seq_cst) failure(acquire) %{{.+}}, %{{.+}}, %{{.+}} align(4) : (!cir.ptr, !s32i, !s32i) -> (!s32i, !cir.bool) // CIR-NEXT: %[[FAILED:.+]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool // CIR-NEXT: cir.if %[[FAILED]] { // CIR-NEXT: cir.store align(4) %[[OLD]], %{{.+}} : !s32i, !cir.ptr @@ -354,7 +354,7 @@ void atomic_cmpxchg_n(int *ptr, int *expected, int desired) { // OGCG-LABEL: @atomic_cmpxchg_n __atomic_compare_exchange_n(ptr, expected, desired, /*weak=*/0, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); - // CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg(%{{.+}} : !cir.ptr, %{{.+}} : !s32i, %{{.+}} : !s32i, success = seq_cst, failure = acquire) align(4) : (!s32i, !cir.bool) + // CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg success(seq_cst) failure(acquire) %{{.+}}, %{{.+}}, %{{.+}} align(4) : (!cir.ptr, !s32i, !s32i) -> (!s32i, !cir.bool) // CIR-NEXT: %[[FAILED:.+]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool // CIR-NEXT: cir.if %[[FAILED]] { // CIR-NEXT: cir.store align(4) %[[OLD]], %{{.+}} : !s32i, !cir.ptr @@ -385,7 +385,7 @@ void atomic_cmpxchg_n(int *ptr, int *expected, int desired) { // OGCG-NEXT: store i8 %[[SUCCESS_2]], ptr %{{.+}}, align 1 __atomic_compare_exchange_n(ptr, expected, desired, /*weak=*/1, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE); - // CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg(%{{.+}} : !cir.ptr, %{{.+}} : !s32i, %{{.+}} : !s32i, success = seq_cst, failure = acquire) align(4) weak : (!s32i, !cir.bool) + // CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg weak success(seq_cst) failure(acquire) %{{.+}}, %{{.+}}, %{{.+}} align(4) : (!cir.ptr, !s32i, !s32i) -> (!s32i, !cir.bool) // CIR-NEXT: %[[FAILED:.+]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool // CIR-NEXT: cir.if %[[FAILED]] { // CIR-NEXT: cir.store align(4) %[[OLD]], %{{.+}} : !s32i, !cir.ptr @@ -427,12 +427,12 @@ void c11_atomic_exchange(_Atomic(int) *ptr, int value) { __c11_atomic_exchange(ptr, value, __ATOMIC_RELEASE); __c11_atomic_exchange(ptr, value, __ATOMIC_ACQ_REL); __c11_atomic_exchange(ptr, value, __ATOMIC_SEQ_CST); - // CIR: %{{.+}} = cir.atomic.xchg relaxed %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg consume %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg acquire %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg release %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg acq_rel %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg seq_cst %{{.+}}, %{{.+}} : !cir.ptr -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg relaxed %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg consume %{{.+}}, %{{.+}} : 
(!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg acquire %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg release %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg acq_rel %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i // LLVM: %{{.+}} = atomicrmw xchg ptr %{{.+}}, i32 %{{.+}} monotonic, align 4 // LLVM: %{{.+}} = atomicrmw xchg ptr %{{.+}}, i32 %{{.+}} acquire, align 4 @@ -460,12 +460,12 @@ void atomic_exchange(int *ptr, int *value, int *old) { __atomic_exchange(ptr, value, old, __ATOMIC_RELEASE); __atomic_exchange(ptr, value, old, __ATOMIC_ACQ_REL); __atomic_exchange(ptr, value, old, __ATOMIC_SEQ_CST); - // CIR: %{{.+}} = cir.atomic.xchg relaxed %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg consume %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg acquire %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg release %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg acq_rel %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg seq_cst %{{.+}}, %{{.+}} : !cir.ptr -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg relaxed %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg consume %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg acquire %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg release %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg acq_rel %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i // LLVM: %{{.+}} = atomicrmw xchg ptr %{{.+}}, i32 %{{.+}} monotonic, align 4 // LLVM: %{{.+}} = atomicrmw xchg ptr %{{.+}}, i32 %{{.+}} acquire, align 4 @@ -493,12 +493,12 @@ void atomic_exchange_n(int *ptr, int value) { __atomic_exchange_n(ptr, value, __ATOMIC_RELEASE); __atomic_exchange_n(ptr, value, __ATOMIC_ACQ_REL); __atomic_exchange_n(ptr, value, __ATOMIC_SEQ_CST); - // CIR: %{{.+}} = cir.atomic.xchg relaxed %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg consume %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg acquire %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg release %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg acq_rel %{{.+}}, %{{.+}} : !cir.ptr -> !s32i - // CIR: %{{.+}} = cir.atomic.xchg seq_cst %{{.+}}, %{{.+}} : !cir.ptr -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg relaxed %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg consume %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg acquire %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg release %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg acq_rel %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + // CIR: %{{.+}} = cir.atomic.xchg seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i // LLVM: %{{.+}} = atomicrmw xchg ptr %{{.+}}, i32 %{{.+}} monotonic, align 4 // LLVM: %{{.+}} = atomicrmw xchg ptr %{{.+}}, i32 %{{.+}} acquire, align 4 @@ -514,3 +514,596 @@ void atomic_exchange_n(int *ptr, int value) { // OGCG: %{{.+}} = atomicrmw xchg ptr %{{.+}}, i32 %{{.+}} acq_rel, align 4 // OGCG: %{{.+}} = atomicrmw xchg ptr %{{.+}}, 
i32 %{{.+}} seq_cst, align 4 } + +void test_and_set(void *p) { + // CIR-LABEL: @test_and_set + // LLVM-LABEL: @test_and_set + // OGCG-LABEL: @test_and_set + + __atomic_test_and_set(p, __ATOMIC_SEQ_CST); + // CIR: %[[VOID_PTR:.+]] = cir.load align(8) %{{.+}} : !cir.ptr>, !cir.ptr + // CIR-NEXT: %[[PTR:.+]] = cir.cast bitcast %[[VOID_PTR]] : !cir.ptr -> !cir.ptr + // CIR-NEXT: %[[RES:.+]] = cir.atomic.test_and_set seq_cst %[[PTR]] : !cir.ptr -> !cir.bool + // CIR-NEXT: cir.store align(1) %[[RES]], %{{.+}} : !cir.bool, !cir.ptr + + // LLVM: %[[PTR:.+]] = load ptr, ptr %{{.+}}, align 8 + // LLVM-NEXT: %[[RES:.+]] = atomicrmw xchg ptr %[[PTR]], i8 1 seq_cst, align 1 + // LLVM-NEXT: %{{.+}} = icmp ne i8 %[[RES]], 0 + + // OGCG: %[[PTR:.+]] = load ptr, ptr %{{.+}}, align 8 + // OGCG-NEXT: %[[RES:.+]] = atomicrmw xchg ptr %[[PTR]], i8 1 seq_cst, align 1 + // OGCG-NEXT: %{{.+}} = icmp ne i8 %[[RES]], 0 +} + +void test_and_set_volatile(volatile void *p) { + // CIR-LABEL: @test_and_set_volatile + // LLVM-LABEL: @test_and_set_volatile + // OGCG-LABEL: @test_and_set_volatile + + __atomic_test_and_set(p, __ATOMIC_SEQ_CST); + // CIR: %[[VOID_PTR:.+]] = cir.load align(8) %{{.+}} : !cir.ptr>, !cir.ptr + // CIR-NEXT: %[[PTR:.+]] = cir.cast bitcast %[[VOID_PTR]] : !cir.ptr -> !cir.ptr + // CIR-NEXT: %[[RES:.+]] = cir.atomic.test_and_set seq_cst %[[PTR]] volatile : !cir.ptr -> !cir.bool + // CIR-NEXT: cir.store align(1) %[[RES]], %{{.+}} : !cir.bool, !cir.ptr + + // LLVM: %[[PTR:.+]] = load ptr, ptr %{{.+}}, align 8 + // LLVM-NEXT: %[[RES:.+]] = atomicrmw volatile xchg ptr %[[PTR]], i8 1 seq_cst, align 1 + // LLVM-NEXT: %{{.+}} = icmp ne i8 %[[RES]], 0 + + // OGCG: %[[PTR:.+]] = load ptr, ptr %{{.+}}, align 8 + // OGCG-NEXT: %[[RES:.+]] = atomicrmw volatile xchg ptr %[[PTR]], i8 1 seq_cst, align 1 + // OGCG-NEXT: %{{.+}} = icmp ne i8 %[[RES]], 0 +} + +void clear(void *p) { + // CIR-LABEL: @clear + // LLVM-LABEL: @clear + // OGCG-LABEL: @clear + + __atomic_clear(p, __ATOMIC_SEQ_CST); + // CIR: %[[VOID_PTR:.+]] = cir.load align(8) %{{.+}} : !cir.ptr>, !cir.ptr + // CIR-NEXT: %[[PTR:.+]] = cir.cast bitcast %[[VOID_PTR]] : !cir.ptr -> !cir.ptr + // CIR: cir.atomic.clear seq_cst %[[PTR]] : !cir.ptr + + // LLVM: store atomic i8 0, ptr %{{.+}} seq_cst, align 1 + + // OGCG: store atomic i8 0, ptr %{{.+}} seq_cst, align 1 +} + +void clear_volatile(volatile void *p) { + // CIR-LABEL: @clear_volatile + // LLVM-LABEL: @clear_volatile + // OGCG-LABEL: @clear_volatile + + __atomic_clear(p, __ATOMIC_SEQ_CST); + // CIR: %[[VOID_PTR:.+]] = cir.load align(8) %{{.+}} : !cir.ptr>, !cir.ptr + // CIR-NEXT: %[[PTR:.+]] = cir.cast bitcast %[[VOID_PTR]] : !cir.ptr -> !cir.ptr + // CIR: cir.atomic.clear seq_cst %[[PTR]] volatile : !cir.ptr + + // LLVM: store atomic volatile i8 0, ptr %{{.+}} seq_cst, align 1 + + // OGCG: store atomic volatile i8 0, ptr %{{.+}} seq_cst, align 1 +} + +int atomic_fetch_add(int *ptr, int value) { + // CIR-LABEL: @atomic_fetch_add + // LLVM-LABEL: @atomic_fetch_add + // OGCG-LABEL: @atomic_fetch_add + + return __atomic_fetch_add(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch add seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw add ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw add ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_add_fetch(int *ptr, int value) { + // CIR-LABEL: 
@atomic_add_fetch + // LLVM-LABEL: @atomic_add_fetch + // OGCG-LABEL: @atomic_add_fetch + + return __atomic_add_fetch(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch add seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[OLD:.+]] = atomicrmw add ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // LLVM-NEXT: %[[RES:.+]] = add i32 %[[OLD]], %[[VAL]] + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[OLD:.+]] = atomicrmw add ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // OGCG-NEXT: %[[RES:.+]] = add i32 %[[OLD]], %[[VAL]] + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int c11_atomic_fetch_add(_Atomic(int) *ptr, int value) { + // CIR-LABEL: @c11_atomic_fetch_add + // LLVM-LABEL: @c11_atomic_fetch_add + // OGCG-LABEL: @c11_atomic_fetch_add + + return __c11_atomic_fetch_add(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch add seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw add ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw add ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_fetch_sub(int *ptr, int value) { + // CIR-LABEL: @atomic_fetch_sub + // LLVM-LABEL: @atomic_fetch_sub + // OGCG-LABEL: @atomic_fetch_sub + + return __atomic_fetch_sub(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch sub seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw sub ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw sub ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_sub_fetch(int *ptr, int value) { + // CIR-LABEL: @atomic_sub_fetch + // LLVM-LABEL: @atomic_sub_fetch + // OGCG-LABEL: @atomic_sub_fetch + + return __atomic_sub_fetch(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch sub seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[OLD:.+]] = atomicrmw sub ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // LLVM-NEXT: %[[RES:.+]] = sub i32 %[[OLD]], %[[VAL]] + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[OLD:.+]] = atomicrmw sub ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // OGCG-NEXT: %[[RES:.+]] = sub i32 %[[OLD]], %[[VAL]] + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int c11_atomic_fetch_sub(_Atomic(int) *ptr, int value) { + // CIR-LABEL: @c11_atomic_fetch_sub + // LLVM-LABEL: @c11_atomic_fetch_sub + // OGCG-LABEL: @c11_atomic_fetch_sub + + return __c11_atomic_fetch_sub(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch sub seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw sub ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw sub ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +float atomic_fetch_add_fp(float *ptr, float value) { + // CIR-LABEL: @atomic_fetch_add_fp + // LLVM-LABEL: @atomic_fetch_add_fp + // OGCG-LABEL: @atomic_fetch_add_fp + + return __atomic_fetch_add(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch add seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, 
!cir.float) -> !cir.float + + // LLVM: %[[RES:.+]] = atomicrmw fadd ptr %{{.+}}, float %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw fadd ptr %{{.+}}, float %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 +} + +float atomic_add_fetch_fp(float *ptr, float value) { + // CIR-LABEL: @atomic_add_fetch_fp + // LLVM-LABEL: @atomic_add_fetch_fp + // OGCG-LABEL: @atomic_add_fetch_fp + + return __atomic_add_fetch(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch add seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !cir.float) -> !cir.float + + // LLVM: %[[OLD:.+]] = atomicrmw fadd ptr %{{.+}}, float %[[VAL:.+]] seq_cst, align 4 + // LLVM-NEXT: %[[RES:.+]] = fadd float %[[OLD]], %[[VAL]] + // LLVM-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[OLD:.+]] = atomicrmw fadd ptr %{{.+}}, float %[[VAL:.+]] seq_cst, align 4 + // OGCG-NEXT: %[[RES:.+]] = fadd float %[[OLD]], %[[VAL]] + // OGCG-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 +} + +float c11_atomic_fetch_sub_fp(_Atomic(float) *ptr, float value) { + // CIR-LABEL: @c11_atomic_fetch_sub_fp + // LLVM-LABEL: @c11_atomic_fetch_sub_fp + // OGCG-LABEL: @c11_atomic_fetch_sub_fp + + return __c11_atomic_fetch_sub(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch sub seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !cir.float) -> !cir.float + + // LLVM: %[[RES:.+]] = atomicrmw fsub ptr %{{.+}}, float %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw fsub ptr %{{.+}}, float %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_fetch_min(int *ptr, int value) { + // CIR-LABEL: @atomic_fetch_min + // LLVM-LABEL: @atomic_fetch_min + // OGCG-LABEL: @atomic_fetch_min + + return __atomic_fetch_min(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch min seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw min ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw min ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_min_fetch(int *ptr, int value) { + // CIR-LABEL: @atomic_min_fetch + // LLVM-LABEL: @atomic_min_fetch + // OGCG-LABEL: @atomic_min_fetch + + return __atomic_min_fetch(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch min seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[OLD:.+]] = atomicrmw min ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // LLVM-NEXT: %[[OLD_LESS:.+]] = icmp slt i32 %[[OLD]], %[[VAL]] + // LLVM-NEXT: %[[RES:.+]] = select i1 %[[OLD_LESS]], i32 %[[OLD]], i32 %[[VAL]] + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[OLD:.+]] = atomicrmw min ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // OGCG-NEXT: %[[OLD_LESS:.+]] = icmp slt i32 %[[OLD]], %[[VAL]] + // OGCG-NEXT: %[[RES:.+]] = select i1 %[[OLD_LESS]], i32 %[[OLD]], i32 %[[VAL]] + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int c11_atomic_fetch_min(_Atomic(int) *ptr, int value) { + // CIR-LABEL: @c11_atomic_fetch_min + // LLVM-LABEL: @c11_atomic_fetch_min + // OGCG-LABEL: @c11_atomic_fetch_min + + return __c11_atomic_fetch_min(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch min seq_cst fetch_first %{{.+}}, 
%{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw min ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw min ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +float atomic_fetch_min_fp(float *ptr, float value) { + // CIR-LABEL: @atomic_fetch_min_fp + // LLVM-LABEL: @atomic_fetch_min_fp + // OGCG-LABEL: @atomic_fetch_min_fp + + return __atomic_fetch_min(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch min seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !cir.float) -> !cir.float + + // LLVM: %[[RES:.+]] = atomicrmw fmin ptr %{{.+}}, float %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw fmin ptr %{{.+}}, float %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 +} + +float atomic_min_fetch_fp(float *ptr, float value) { + // CIR-LABEL: @atomic_min_fetch_fp + // LLVM-LABEL: @atomic_min_fetch_fp + // OGCG-LABEL: @atomic_min_fetch_fp + + return __atomic_min_fetch(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch min seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !cir.float) -> !cir.float + + // LLVM: %[[OLD:.+]] = atomicrmw fmin ptr %{{.+}}, float %[[VAL:.+]] seq_cst, align 4 + // LLVM-NEXT: %[[RES:.+]] = call float @llvm.minnum.f32(float %[[OLD]], float %[[VAL]]) + // LLVM-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[OLD:.+]] = atomicrmw fmin ptr %{{.+}}, float %[[VAL:.+]] seq_cst, align 4 + // OGCG-NEXT: %[[RES:.+]] = call float @llvm.minnum.f32(float %[[OLD]], float %[[VAL]]) + // OGCG-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 +} + +float c11_atomic_fetch_min_fp(_Atomic(float) *ptr, float value) { + // CIR-LABEL: @c11_atomic_fetch_min_fp + // LLVM-LABEL: @c11_atomic_fetch_min_fp + // OGCG-LABEL: @c11_atomic_fetch_min_fp + + return __c11_atomic_fetch_min(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch min seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !cir.float) -> !cir.float + + // LLVM: %[[RES:.+]] = atomicrmw fmin ptr %{{.+}}, float %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw fmin ptr %{{.+}}, float %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_fetch_max(int *ptr, int value) { + // CIR-LABEL: @atomic_fetch_max + // LLVM-LABEL: @atomic_fetch_max + // OGCG-LABEL: @atomic_fetch_max + + return __atomic_fetch_max(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch max seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw max ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw max ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_max_fetch(int *ptr, int value) { + // CIR-LABEL: @atomic_max_fetch + // LLVM-LABEL: @atomic_max_fetch + // OGCG-LABEL: @atomic_max_fetch + + return __atomic_max_fetch(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch max seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[OLD:.+]] = atomicrmw max ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // LLVM-NEXT: %[[OLD_GREATER:.+]] = icmp sgt i32 %[[OLD]], %[[VAL]] + // LLVM-NEXT: %[[RES:.+]] = select i1 
%[[OLD_GREATER]], i32 %[[OLD]], i32 %[[VAL]] + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[OLD:.+]] = atomicrmw max ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // OGCG-NEXT: %[[OLD_GREATER:.+]] = icmp sgt i32 %[[OLD]], %[[VAL]] + // OGCG-NEXT: %[[RES:.+]] = select i1 %[[OLD_GREATER]], i32 %[[OLD]], i32 %[[VAL]] + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int c11_atomic_fetch_max(_Atomic(int) *ptr, int value) { + // CIR-LABEL: @c11_atomic_fetch_max + // LLVM-LABEL: @c11_atomic_fetch_max + // OGCG-LABEL: @c11_atomic_fetch_max + + return __c11_atomic_fetch_max(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch max seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw max ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw max ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +float atomic_fetch_max_fp(float *ptr, float value) { + // CIR-LABEL: @atomic_fetch_max_fp + // LLVM-LABEL: @atomic_fetch_max_fp + // OGCG-LABEL: @atomic_fetch_max_fp + + return __atomic_fetch_max(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch max seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !cir.float) -> !cir.float + + // LLVM: %[[RES:.+]] = atomicrmw fmax ptr %{{.+}}, float %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw fmax ptr %{{.+}}, float %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 +} + +float atomic_max_fetch_fp(float *ptr, float value) { + // CIR-LABEL: @atomic_max_fetch_fp + // LLVM-LABEL: @atomic_max_fetch_fp + // OGCG-LABEL: @atomic_max_fetch_fp + + return __atomic_max_fetch(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch max seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !cir.float) -> !cir.float + + // LLVM: %[[OLD:.+]] = atomicrmw fmax ptr %{{.+}}, float %[[VAL:.+]] seq_cst, align 4 + // LLVM-NEXT: %[[RES:.+]] = call float @llvm.maxnum.f32(float %[[OLD]], float %[[VAL]]) + // LLVM-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[OLD:.+]] = atomicrmw fmax ptr %{{.+}}, float %[[VAL:.+]] seq_cst, align 4 + // OGCG-NEXT: %[[RES:.+]] = call float @llvm.maxnum.f32(float %[[OLD]], float %[[VAL]]) + // OGCG-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 +} + +float c11_atomic_fetch_max_fp(_Atomic(float) *ptr, float value) { + // CIR-LABEL: @c11_atomic_fetch_max_fp + // LLVM-LABEL: @c11_atomic_fetch_max_fp + // OGCG-LABEL: @c11_atomic_fetch_max_fp + + return __c11_atomic_fetch_max(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch max seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !cir.float) -> !cir.float + + // LLVM: %[[RES:.+]] = atomicrmw fmax ptr %{{.+}}, float %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw fmax ptr %{{.+}}, float %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store float %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_fetch_and(int *ptr, int value) { + // CIR-LABEL: @atomic_fetch_and + // LLVM-LABEL: @atomic_fetch_and + // OGCG-LABEL: @atomic_fetch_and + + return __atomic_fetch_and(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch and seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw and ptr %{{.+}}, i32 %{{.+}} seq_cst, 
align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw and ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_and_fetch(int *ptr, int value) { + // CIR-LABEL: @atomic_and_fetch + // LLVM-LABEL: @atomic_and_fetch + // OGCG-LABEL: @atomic_and_fetch + + return __atomic_and_fetch(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch and seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[OLD:.+]] = atomicrmw and ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // LLVM-NEXT: %[[RES:.+]] = and i32 %[[OLD]], %[[VAL]] + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[OLD:.+]] = atomicrmw and ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // OGCG-NEXT: %[[RES:.+]] = and i32 %[[OLD]], %[[VAL]] + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int c11_atomic_fetch_and(_Atomic(int) *ptr, int value) { + // CIR-LABEL: @c11_atomic_fetch_and + // LLVM-LABEL: @c11_atomic_fetch_and + // OGCG-LABEL: @c11_atomic_fetch_and + + return __c11_atomic_fetch_and(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch and seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw and ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw and ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_fetch_or(int *ptr, int value) { + // CIR-LABEL: @atomic_fetch_or + // LLVM-LABEL: @atomic_fetch_or + // OGCG-LABEL: @atomic_fetch_or + + return __atomic_fetch_or(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch or seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw or ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw or ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_or_fetch(int *ptr, int value) { + // CIR-LABEL: @atomic_or_fetch + // LLVM-LABEL: @atomic_or_fetch + // OGCG-LABEL: @atomic_or_fetch + + return __atomic_or_fetch(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch or seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[OLD:.+]] = atomicrmw or ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // LLVM-NEXT: %[[RES:.+]] = or i32 %[[OLD]], %[[VAL]] + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[OLD:.+]] = atomicrmw or ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // OGCG-NEXT: %[[RES:.+]] = or i32 %[[OLD]], %[[VAL]] + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int c11_atomic_fetch_or(_Atomic(int) *ptr, int value) { + // CIR-LABEL: @c11_atomic_fetch_or + // LLVM-LABEL: @c11_atomic_fetch_or + // OGCG-LABEL: @c11_atomic_fetch_or + + return __c11_atomic_fetch_or(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch or seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw or ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw or ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_fetch_xor(int *ptr, int value) { + // CIR-LABEL: @atomic_fetch_xor + 
// LLVM-LABEL: @atomic_fetch_xor + // OGCG-LABEL: @atomic_fetch_xor + + return __atomic_fetch_xor(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch xor seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw xor ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw xor ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_xor_fetch(int *ptr, int value) { + // CIR-LABEL: @atomic_xor_fetch + // LLVM-LABEL: @atomic_xor_fetch + // OGCG-LABEL: @atomic_xor_fetch + + return __atomic_xor_fetch(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch xor seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[OLD:.+]] = atomicrmw xor ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // LLVM-NEXT: %[[RES:.+]] = xor i32 %[[OLD]], %[[VAL]] + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[OLD:.+]] = atomicrmw xor ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // OGCG-NEXT: %[[RES:.+]] = xor i32 %[[OLD]], %[[VAL]] + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int c11_atomic_fetch_xor(_Atomic(int) *ptr, int value) { + // CIR-LABEL: @c11_atomic_fetch_xor + // LLVM-LABEL: @c11_atomic_fetch_xor + // OGCG-LABEL: @c11_atomic_fetch_xor + + return __c11_atomic_fetch_xor(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch xor seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw xor ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw xor ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_fetch_nand(int *ptr, int value) { + // CIR-LABEL: @atomic_fetch_nand + // LLVM-LABEL: @atomic_fetch_nand + // OGCG-LABEL: @atomic_fetch_nand + + return __atomic_fetch_nand(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch nand seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw nand ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw nand ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int atomic_nand_fetch(int *ptr, int value) { + // CIR-LABEL: @atomic_nand_fetch + // LLVM-LABEL: @atomic_nand_fetch + // OGCG-LABEL: @atomic_nand_fetch + + return __atomic_nand_fetch(ptr, value, __ATOMIC_SEQ_CST); + // CIR: %{{.+}} = cir.atomic.fetch nand seq_cst %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[OLD:.+]] = atomicrmw nand ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // LLVM-NEXT: %[[TMP:.+]] = and i32 %[[OLD]], %[[VAL]] + // LLVM-NEXT: %[[RES:.+]] = xor i32 %[[TMP]], -1 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[OLD:.+]] = atomicrmw nand ptr %{{.+}}, i32 %[[VAL:.+]] seq_cst, align 4 + // OGCG-NEXT: %[[TMP:.+]] = and i32 %[[OLD]], %[[VAL]] + // OGCG-NEXT: %[[RES:.+]] = xor i32 %[[TMP]], -1 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} + +int c11_atomic_fetch_nand(_Atomic(int) *ptr, int value) { + // CIR-LABEL: @c11_atomic_fetch_nand + // LLVM-LABEL: @c11_atomic_fetch_nand + // OGCG-LABEL: @c11_atomic_fetch_nand + + return __c11_atomic_fetch_nand(ptr, value, __ATOMIC_SEQ_CST); + // CIR: 
%{{.+}} = cir.atomic.fetch nand seq_cst fetch_first %{{.+}}, %{{.+}} : (!cir.ptr, !s32i) -> !s32i + + // LLVM: %[[RES:.+]] = atomicrmw nand ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // LLVM-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 + + // OGCG: %[[RES:.+]] = atomicrmw nand ptr %{{.+}}, i32 %{{.+}} seq_cst, align 4 + // OGCG-NEXT: store i32 %[[RES]], ptr %{{.+}}, align 4 +} diff --git a/clang/test/CIR/CodeGen/binassign.c b/clang/test/CIR/CodeGen/binassign.c index 65bea4df7d837..44c54b4a2969a 100644 --- a/clang/test/CIR/CodeGen/binassign.c +++ b/clang/test/CIR/CodeGen/binassign.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c23 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: %clang_cc1 -std=c23 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-lowering-prepare %s -o %t.cir 2> %t-before-lp.cir // RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR // RUN: %clang_cc1 -std=c23 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll // RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM @@ -17,7 +17,7 @@ void binary_assign(void) { i = 42; } -// CIR-LABEL: cir.func{{.*}} @binary_assign() { +// CIR-LABEL: cir.func{{.*}} @binary_assign() // CIR: %[[B:.*]] = cir.alloca !cir.bool, !cir.ptr, ["b"] // CIR: %[[C:.*]] = cir.alloca !s8i, !cir.ptr, ["c"] // CIR: %[[F:.*]] = cir.alloca !cir.float, !cir.ptr, ["f"] @@ -33,7 +33,7 @@ void binary_assign(void) { // CIR: cir.store{{.*}} %[[INT_VAL]], %[[I]] : !s32i, !cir.ptr // CIR: cir.return -// LLVM-LABEL: define {{.*}}void @binary_assign() { +// LLVM-LABEL: define {{.*}}void @binary_assign(){{.*}} { // LLVM: %[[B_PTR:.*]] = alloca i8 // LLVM: %[[C_PTR:.*]] = alloca i8 // LLVM: %[[F_PTR:.*]] = alloca float @@ -54,3 +54,49 @@ void binary_assign(void) { // OGCG: store float 0x40091EB860000000, ptr %[[F_PTR]] // OGCG: store i32 42, ptr %[[I_PTR]] // OGCG: ret void + +struct S { + int a; + float b; +}; + +struct SV { + int a; + volatile float b; +}; + +struct S gs; +struct SV gsv; + +void binary_assign_struct() { + // Test normal struct assignment + struct S ls; + ls = gs; + + // Test assignment of a struct with a volatile member + struct SV lsv; + lsv = gsv; +} + +// CIR: cir.func{{.*}} @binary_assign_struct() +// CIR: %[[LS:.*]] = cir.alloca ![[REC_S:.*]], !cir.ptr, ["ls"] +// CIR: %[[LSV:.*]] = cir.alloca ![[REC_SV:.*]], !cir.ptr, ["lsv"] +// CIR: %[[GS_PTR:.*]] = cir.get_global @gs : !cir.ptr +// CIR: cir.copy %[[GS_PTR]] to %[[LS]] : !cir.ptr +// CIR: %[[GSV_PTR:.*]] = cir.get_global @gsv : !cir.ptr +// CIR: cir.copy %[[GSV_PTR]] to %[[LSV]] volatile : !cir.ptr +// CIR: cir.return + +// LLVM: define {{.*}}void @binary_assign_struct() +// LLVM: %[[LS_PTR:.*]] = alloca %struct.S +// LLVM: %[[LSV_PTR:.*]] = alloca %struct.SV +// LLVM: call void @llvm.memcpy.p0.p0.i32(ptr %[[LS_PTR]], ptr @gs, i32 8, i1 false) +// LLVM: call void @llvm.memcpy.p0.p0.i32(ptr %[[LSV_PTR]], ptr @gsv, i32 8, i1 true) +// LLVM: ret void + +// OGCG: define {{.*}}void @binary_assign_struct() +// OGCG: %[[LS_PTR:.*]] = alloca %struct.S +// OGCG: %[[LSV_PTR:.*]] = alloca %struct.SV +// OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LS_PTR]], ptr align 4 @gs, i64 8, i1 false) +// OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LSV_PTR]], ptr align 4 @gsv, i64 8, i1 true) +// OGCG: ret void diff --git a/clang/test/CIR/CodeGen/bitfields_be.c b/clang/test/CIR/CodeGen/bitfields_be.c index 77741ba74870b..3e1f05401728a 100644 --- a/clang/test/CIR/CodeGen/bitfields_be.c +++ 
b/clang/test/CIR/CodeGen/bitfields_be.c @@ -27,7 +27,7 @@ int init(S* s) { //CIR: [[TMP2:%.*]] = cir.get_member [[TMP1]][0] {name = "c"} : !cir.ptr -> !cir.ptr //CIR: [[TMP3:%.*]] = cir.get_bitfield align(4) (#bfi_c, [[TMP2]] : !cir.ptr) -> !s32i -//LLVM: define dso_local i32 @init(ptr %0) { +//LLVM: define dso_local i32 @init(ptr %0){{.*}} { //LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 //LLVM: [[TMP1:%.*]] = alloca i32, i64 1, align 4 //LLVM: [[TMP2:%.*]] = load ptr, ptr [[TMP0]], align 8 @@ -59,7 +59,7 @@ void load(S* s) { // CIR: %[[GET0:.*]] = cir.get_member %[[VAL0]][0] {name = "a"} : !cir.ptr -> !cir.ptr // CIR: %[[SET0:.*]] = cir.set_bitfield align(4) (#bfi_a, %[[GET0]] : !cir.ptr, %[[MIN1]] : !s32i) -> !s32i -// LLVM: define dso_local void @load +// LLVM: define dso_local void @load{{.*}} // LLVM: %[[PTR0:.*]] = load ptr // LLVM: %[[GET0:.*]] = getelementptr %struct.S, ptr %[[PTR0]], i32 0, i32 0 // LLVM: %[[VAL0:.*]] = load i32, ptr %[[GET0]], align 4 diff --git a/clang/test/CIR/CodeGen/builtin_call.cpp b/clang/test/CIR/CodeGen/builtin_call.cpp index a30df97250d19..a08a784951247 100644 --- a/clang/test/CIR/CodeGen/builtin_call.cpp +++ b/clang/test/CIR/CodeGen/builtin_call.cpp @@ -82,7 +82,7 @@ void library_builtins() { __builtin_abort(); } -// CIR: cir.func{{.*}} @_Z16library_builtinsv() { +// CIR: cir.func{{.*}} @_Z16library_builtinsv() // CIR: %[[NULL:.+]] = cir.const #cir.ptr : !cir.ptr // CIR: cir.call @printf(%[[NULL]]) nothrow : (!cir.ptr) -> !s32i // CIR: cir.call @abort() nothrow : () -> () diff --git a/clang/test/CIR/CodeGen/builtin_inline.c b/clang/test/CIR/CodeGen/builtin_inline.c new file mode 100644 index 0000000000000..83a3ba6e53f4b --- /dev/null +++ b/clang/test/CIR/CodeGen/builtin_inline.c @@ -0,0 +1,91 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -disable-llvm-passes %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -disable-llvm-passes %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +typedef unsigned long size_t; + +// Normal inline builtin declaration +// When a builtin is redefined with extern inline + always_inline attributes, +// the compiler creates a .inline version to avoid conflicts with the builtin. + +extern inline __attribute__((always_inline)) __attribute__((gnu_inline)) +void *memcpy(void *a, const void *b, size_t c) { + return __builtin_memcpy(a, b, c); +} + +void *test_inline_builtin_memcpy(void *a, const void *b, size_t c) { + return memcpy(a, b, c); +} + +// CIR: cir.func internal private{{.*}}@memcpy.inline({{.*}}) -> !cir.ptr inline(always) + +// CIR-LABEL: @test_inline_builtin_memcpy( +// CIR: cir.call @memcpy.inline( +// CIR: } + +// LLVM: define internal ptr @memcpy.inline(ptr{{.*}}, ptr{{.*}}, i64{{.*}}) #{{[0-9]+}} + +// LLVM-LABEL: @test_inline_builtin_memcpy( +// LLVM: call ptr @memcpy.inline( + +// OGCG-LABEL: @test_inline_builtin_memcpy( +// OGCG: call ptr @memcpy.inline( + +// OGCG: define internal ptr @memcpy.inline(ptr{{.*}} %a, ptr{{.*}} %b, i64{{.*}} %c) #{{[0-9]+}} + +// Shadowing case +// When a non-inline function definition shadows an inline builtin declaration, +// the .inline version should be replaced with the regular function and removed.
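For reference, a minimal standalone sketch of this shadowing mechanism follows, assuming the same Clang gnu_inline semantics that the memmove test below exercises; memset is chosen purely as another builtin example and is not part of this patch.

typedef unsigned long size_t;

/* Inline builtin declaration: redefining a builtin with extern inline +
   always_inline + gnu_inline makes the compiler split the body off as a
   separate memset.inline function, so it cannot clash with the builtin. */
extern inline __attribute__((always_inline)) __attribute__((gnu_inline))
void *memset(void *s, int c, size_t n) {
  return __builtin_memset(s, c, n);
}

/* Shadowing definition: once a regular, non-inline definition of the same
   name appears, the memset.inline copy is expected to be dropped and call
   sites bind to this symbol instead. */
void *memset(void *s, int c, size_t n) {
  unsigned char *p = (unsigned char *)s;
  for (size_t i = 0; i < n; i++)
    p[i] = (unsigned char)c;
  return s;
}

void *zero_buffer(void *buf, size_t len) {
  return memset(buf, 0, len); /* resolves to the shadowing definition */
}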
+ +extern inline __attribute__((always_inline)) __attribute__((gnu_inline)) +void *memmove(void *a, const void *b, size_t c) { + return __builtin_memmove(a, b, c); +} + +void *memmove(void *a, const void *b, size_t c) { + char *dst = (char *)a; + const char *src = (const char *)b; + if (dst < src) { + for (size_t i = 0; i < c; i++) { + dst[i] = src[i]; + } + } else { + for (size_t i = c; i > 0; i--) { + dst[i-1] = src[i-1]; + } + } + return a; +} + +void *test_shadowed_memmove(void *a, const void *b, size_t c) { + return memmove(a, b, c); +} + +// CIR: cir.func{{.*}}@memmove({{.*}}) -> !cir.ptr{{.*}}{ +// CIR-NOT: @memmove.inline + +// CIR-LABEL: @test_shadowed_memmove( +// CIR: cir.call @memmove( +// CIR-NOT: @memmove.inline +// CIR: } + +// LLVM: define dso_local ptr @memmove(ptr{{.*}}, ptr{{.*}}, i64{{.*}}) #{{[0-9]+}} +// LLVM-NOT: @memmove.inline + +// LLVM-LABEL: @test_shadowed_memmove( +// TODO - this deviation from OGCG is expected until we implement the nobuiltin +// attribute. See CIRGenFunction::emitDirectCallee +// LLVM: call ptr @memmove( +// LLVM-NOT: @memmove.inline +// LLVM: } + +// OGCG: define dso_local ptr @memmove(ptr{{.*}} %a, ptr{{.*}} %b, i64{{.*}} %c) #{{[0-9]+}} +// OGCG-NOT: @memmove.inline + +// OGCG-LABEL: @test_shadowed_memmove( +// OGCG: call void @llvm.memmove.p0.p0.i64( +// OGCG-NOT: @memmove.inline +// OGCG: } diff --git a/clang/test/CIR/CodeGen/builtin_printf.cpp b/clang/test/CIR/CodeGen/builtin_printf.cpp index 898984a6c12d3..7200df1a8ba35 100644 --- a/clang/test/CIR/CodeGen/builtin_printf.cpp +++ b/clang/test/CIR/CodeGen/builtin_printf.cpp @@ -20,7 +20,7 @@ void func(char const * const str, int i) { // CIR: cir.func{{.*}} @printf(!cir.ptr, ...) -> !s32i -// CIR: cir.func{{.*}} @_Z4funcPKci(%[[arg0:.+]]: !cir.ptr{{.*}}, %[[arg1:.+]]: !s32i{{.*}}) { +// CIR: cir.func{{.*}} @_Z4funcPKci(%[[arg0:.+]]: !cir.ptr{{.*}}, %[[arg1:.+]]: !s32i // CIR: %[[str_ptr:.+]] = cir.alloca !cir.ptr, !cir.ptr>, ["str", init, const] // CIR: %[[i_ptr:.+]] = cir.alloca !s32i, !cir.ptr, ["i", init] // CIR: cir.store %[[arg0]], %[[str_ptr]] : !cir.ptr, !cir.ptr> diff --git a/clang/test/CIR/CodeGen/call.c b/clang/test/CIR/CodeGen/call.c index 9d516c6d831d8..d780e37f3d153 100644 --- a/clang/test/CIR/CodeGen/call.c +++ b/clang/test/CIR/CodeGen/call.c @@ -16,11 +16,11 @@ void f2(void) { f1(s); } -// CIR-LABEL: cir.func{{.*}} @f2() +// CIR-LABEL: cir.func{{.*}} @f2(){{.*}} { // CIR: %[[S:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !rec_S // CIR-NEXT: cir.call @f1(%[[S]]) : (!rec_S) -> () -// LLVM-LABEL: define{{.*}} void @f2() +// LLVM-LABEL: define{{.*}} void @f2(){{.*}} // LLVM: %[[S:.+]] = load %struct.S, ptr %{{.+}}, align 4 // LLVM-NEXT: call void @f1(%struct.S %[[S]]) @@ -33,11 +33,11 @@ void f4(void) { struct S s = f3(); } -// CIR-LABEL: cir.func{{.*}} @f4() { +// CIR-LABEL: cir.func{{.*}} @f4(){{.*}} { // CIR: %[[S:.+]] = cir.call @f3() : () -> !rec_S // CIR-NEXT: cir.store align(4) %[[S]], %{{.+}} : !rec_S, !cir.ptr -// LLVM-LABEL: define{{.*}} void @f4() { +// LLVM-LABEL: define{{.*}} void @f4(){{.*}} { // LLVM: %[[S:.+]] = call %struct.S @f3() // LLVM-NEXT: store %struct.S %[[S]], ptr %{{.+}}, align 4 @@ -57,11 +57,11 @@ void f7(void) { f5(b); } -// CIR-LABEL: cir.func{{.*}} @f7() +// CIR-LABEL: cir.func{{.*}} @f7(){{.*}} { // CIR: %[[B:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !rec_Big // CIR-NEXT: cir.call @f5(%[[B]]) : (!rec_Big) -> () -// LLVM-LABEL: define{{.*}} void @f7() { +// LLVM-LABEL: define{{.*}} void @f7(){{.*}} { // LLVM: %[[B:.+]] = load %struct.Big, ptr 
%{{.+}}, align 4 // LLVM-NEXT: call void @f5(%struct.Big %[[B]]) @@ -73,11 +73,11 @@ void f8(void) { struct Big b = f6(); } -// CIR-LABEL: cir.func{{.*}} @f8() +// CIR-LABEL: cir.func{{.*}} @f8(){{.*}} { // CIR: %[[B:.+]] = cir.call @f6() : () -> !rec_Big // CIR: cir.store align(4) %[[B]], %{{.+}} : !rec_Big, !cir.ptr -// LLVM-LABEL: define{{.*}} void @f8() { +// LLVM-LABEL: define{{.*}} void @f8(){{.*}} { // LLVM: %[[B:.+]] = call %struct.Big @f6() // LLVM-NEXT: store %struct.Big %[[B]], ptr %{{.+}}, align 4 @@ -89,14 +89,14 @@ void f9(void) { f1(f3()); } -// CIR-LABEL: cir.func{{.*}} @f9() +// CIR-LABEL: cir.func{{.*}} @f9(){{.*}} { // CIR: %[[SLOT:.+]] = cir.alloca !rec_S, !cir.ptr, ["agg.tmp0"] {alignment = 4 : i64} // CIR-NEXT: %[[RET:.+]] = cir.call @f3() : () -> !rec_S // CIR-NEXT: cir.store align(4) %[[RET]], %[[SLOT]] : !rec_S, !cir.ptr // CIR-NEXT: %[[ARG:.+]] = cir.load align(4) %[[SLOT]] : !cir.ptr, !rec_S // CIR-NEXT: cir.call @f1(%[[ARG]]) : (!rec_S) -> () -// LLVM-LABEL: define{{.*}} void @f9() { +// LLVM-LABEL: define{{.*}} void @f9(){{.*}} { // LLVM: %[[SLOT:.+]] = alloca %struct.S, i64 1, align 4 // LLVM-NEXT: %[[RET:.+]] = call %struct.S @f3() // LLVM-NEXT: store %struct.S %[[RET]], ptr %[[SLOT]], align 4 @@ -116,13 +116,13 @@ int f12(void) { return f10(1) + f11(2); } -// CIR-LABEL: cir.func{{.*}} @f12() -> !s32i +// CIR-LABEL: cir.func{{.*}} @f12() -> !s32i{{.*}} { // CIR: %[[A:.+]] = cir.const #cir.int<1> : !s32i // CIR-NEXT: %{{.+}} = cir.call @f10(%[[A]]) side_effect(pure) : (!s32i) -> !s32i // CIR-NEXT: %[[B:.+]] = cir.const #cir.int<2> : !s32i // CIR-NEXT: %{{.+}} = cir.call @f11(%[[B]]) side_effect(const) : (!s32i) -> !s32i -// LLVM-LABEL: define{{.*}} i32 @f12() +// LLVM-LABEL: define{{.*}} i32 @f12(){{.*}} // LLVM: %{{.+}} = call i32 @f10(i32 1) #[[ATTR0:.+]] // LLVM-NEXT: %{{.+}} = call i32 @f11(i32 2) #[[ATTR1:.+]] diff --git a/clang/test/CIR/CodeGen/call.cpp b/clang/test/CIR/CodeGen/call.cpp index 3e8cfc1cceb51..affa8af47694c 100644 --- a/clang/test/CIR/CodeGen/call.cpp +++ b/clang/test/CIR/CodeGen/call.cpp @@ -12,7 +12,7 @@ void f2() { // CIR-LABEL: cir.func{{.*}} @_Z2f2v // CIR: cir.call @_Z2f1v() : () -> () -// LLVM-LABEL: define{{.*}} void @_Z2f2v() { +// LLVM-LABEL: define{{.*}} void @_Z2f2v(){{.*}} { // LLVM: call void @_Z2f1v() int f3() { return 2; } @@ -25,7 +25,7 @@ int f4() { // CIR-LABEL: cir.func{{.*}} @_Z2f4v() -> !s32i // CIR: cir.call @_Z2f3v() : () -> !s32i -// LLVM-LABEL: define{{.*}} i32 @_Z2f4v() { +// LLVM-LABEL: define{{.*}} i32 @_Z2f4v(){{.*}} { // LLVM: %{{.+}} = call i32 @_Z2f3v() int f5(int a, int *b, bool c); @@ -40,7 +40,7 @@ int f6() { // CIR-NEXT: %[[#c:]] = cir.const #false // CIR-NEXT: %{{.+}} = cir.call @_Z2f5iPib(%[[#a]], %[[#b:]], %[[#c]]) : (!s32i, !cir.ptr, !cir.bool) -> !s32i -// LLVM-LABEL: define{{.*}} i32 @_Z2f6v() { +// LLVM-LABEL: define{{.*}} i32 @_Z2f6v(){{.*}} { // LLVM: %{{.+}} = call i32 @_Z2f5iPib(i32 2, ptr %{{.+}}, i1 false) int f7(int (*ptr)(int, int)) { @@ -67,7 +67,7 @@ void f9() { // CIR: cir.call @_Z2f8iz(%{{.+}}) : (!s32i) -> () // CIR: cir.call @_Z2f8iz(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) : (!s32i, !s32i, !s32i, !s32i) -> () -// LLVM-LABEL: define{{.*}} void @_Z2f9v() +// LLVM-LABEL: define{{.*}} void @_Z2f9v(){{.*}} // LLVM: call void (i32, ...) @_Z2f8iz(i32 1) // LLVM: call void (i32, ...) 
@_Z2f8iz(i32 1, i32 2, i32 3, i32 4) @@ -85,7 +85,7 @@ void f11() { // CIR: %[[#s:]] = cir.call @_Z3f10v() : () -> !rec_S // CIR-NEXT: cir.store align(4) %[[#s]], %{{.+}} : !rec_S, !cir.ptr -// LLVM-LABEL: define{{.*}} void @_Z3f11v() +// LLVM-LABEL: define{{.*}} void @_Z3f11v(){{.*}} // LLVM: %[[#s:]] = call %struct.S @_Z3f10v() // LLVM-NEXT: store %struct.S %[[#s]], ptr %{{.+}}, align 4 @@ -98,7 +98,7 @@ void f12() { // CIR-NEXT: %[[#ret:]] = cir.call @_Z3f10v() : () -> !rec_S // CIR-NEXT: cir.store align(4) %[[#ret]], %[[#slot]] : !rec_S, !cir.ptr -// LLVM-LABEL: define{{.*}} void @_Z3f12v() { +// LLVM-LABEL: define{{.*}} void @_Z3f12v(){{.*}} { // LLVM: %[[#slot:]] = alloca %struct.S, i64 1, align 4 // LLVM-NEXT: %[[#ret:]] = call %struct.S @_Z3f10v() // LLVM-NEXT: store %struct.S %[[#ret]], ptr %[[#slot]], align 4 @@ -112,7 +112,7 @@ void f14() { // CIR: cir.call @_Z3f13v() nothrow : () -> () // CIR: } -// LLVM-LABEL: define{{.+}} void @_Z3f14v() +// LLVM-LABEL: define{{.+}} void @_Z3f14v(){{.*}} // LLVM: call void @_Z3f13v() #[[LLVM_ATTR_0:.+]] // LLVM: } @@ -126,7 +126,7 @@ void f16() { // CIR-NEXT: %{{.+}} = cir.call @_Z3f15v() : () -> !s32i // CIR: } -// LLVM-LABEL: define{{.+}} void @_Z3f16v() { +// LLVM-LABEL: define{{.+}} void @_Z3f16v(){{.*}} { // LLVM-NEXT: %{{.+}} = call i32 @_Z3f15v() // LLVM: } diff --git a/clang/test/CIR/CodeGen/cmp.cpp b/clang/test/CIR/CodeGen/cmp.cpp index 7e32d16e88d57..1871f94ec56d6 100644 --- a/clang/test/CIR/CodeGen/cmp.cpp +++ b/clang/test/CIR/CodeGen/cmp.cpp @@ -45,7 +45,7 @@ void c0(int a, int b) { // CIR: %[[B6:.*]] = cir.load{{.*}} %[[B_PTR]] // CIR: %{{.*}} = cir.cmp(eq, %[[A6]], %[[B6]]) : !s32i, !cir.bool -// LLVM-LABEL: define{{.*}} void @_Z2c0ii(i32 %0, i32 %1) { +// LLVM-LABEL: define{{.*}} void @_Z2c0ii(i32 %0, i32 %1){{.*}} { // LLVM: %[[PTR1:.*]] = alloca i32, i64 1 // LLVM: %[[PTR2:.*]] = alloca i32, i64 1 // LLVM: %[[BOOL_PTR:.*]] = alloca i8, i64 1 @@ -170,7 +170,7 @@ void c0_unsigned(unsigned int a, unsigned int b) { // CIR: %[[UB6:.*]] = cir.load{{.*}} %[[U_B_PTR]] // CIR: %{{.*}} = cir.cmp(eq, %[[UA6]], %[[UB6]]) : !u32i, !cir.bool -// LLVM-LABEL: define{{.*}} void @_Z11c0_unsignedjj(i32 %0, i32 %1) { +// LLVM-LABEL: define{{.*}} void @_Z11c0_unsignedjj(i32 %0, i32 %1){{.*}} { // LLVM: %[[U_PTR1:.*]] = alloca i32, i64 1 // LLVM: %[[U_PTR2:.*]] = alloca i32, i64 1 // LLVM: %[[U_BOOL_PTR:.*]] = alloca i8, i64 1 @@ -265,7 +265,7 @@ void c0_float(float a, float b) { x = a == b; } -// CIR-LABEL: cir.func{{.*}} @_Z8c0_floatff(%arg0: !cir.float{{.*}}, %arg1: !cir.float{{.*}}) { +// CIR-LABEL: cir.func{{.*}} @_Z8c0_floatff(%arg0: !cir.float{{.*}}, %arg1: !cir.float{{.*}}) // CIR: %[[A_PTR:.*]] = cir.alloca !cir.float, !cir.ptr, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !cir.float, !cir.ptr, ["b", init] // CIR: %[[X_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["x", init] @@ -303,7 +303,7 @@ void c0_float(float a, float b) { // CIR: %[[CMP6:.*]] = cir.cmp(eq, %[[A6]], %[[B6]]) : !cir.float, !cir.bool // CIR: cir.store{{.*}} %[[CMP6]], %[[X_PTR]] : !cir.bool, !cir.ptr -// LLVM-LABEL: define{{.*}} void @_Z8c0_floatff(float %0, float %1) { +// LLVM-LABEL: define{{.*}} void @_Z8c0_floatff(float %0, float %1){{.*}} { // LLVM: %[[A_PTR:.*]] = alloca float // LLVM: %[[B_PTR:.*]] = alloca float // LLVM: store float %0, ptr %[[A_PTR]] @@ -346,7 +346,7 @@ void pointer_cmp(int *a, int *b) { x = a != b; } -// CIR-LABEL: cir.func{{.*}} @_Z11pointer_cmpPiS_(%arg0: !cir.ptr{{.*}}, %arg1: !cir.ptr{{.*}}) { +// CIR-LABEL: cir.func{{.*}} 
@_Z11pointer_cmpPiS_(%arg0: !cir.ptr{{.*}}, %arg1: !cir.ptr{{.*}}){{.*}} { // CIR: %[[A_PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["b", init] @@ -360,7 +360,7 @@ void pointer_cmp(int *a, int *b) { // CIR: cir.cmp(eq, {{.*}}, {{.*}}) : !cir.ptr, !cir.bool // CIR: cir.cmp(ne, {{.*}}, {{.*}}) : !cir.ptr, !cir.bool -// LLVM-LABEL: define{{.*}} void @_Z11pointer_cmpPiS_(ptr %0, ptr %1) { +// LLVM-LABEL: define{{.*}} void @_Z11pointer_cmpPiS_(ptr %0, ptr %1){{.*}} { // LLVM: %[[A_PTR:.*]] = alloca ptr // LLVM: %[[B_PTR:.*]] = alloca ptr // LLVM: store ptr %0, ptr %[[A_PTR]] @@ -401,7 +401,7 @@ void bool_cmp(bool a, bool b) { x = a != b; } -// CIR-LABEL: cir.func{{.*}} @_Z8bool_cmpbb(%arg0: !cir.bool{{.*}}, %arg1: !cir.bool{{.*}}) { +// CIR-LABEL: cir.func{{.*}} @_Z8bool_cmpbb(%arg0: !cir.bool{{.*}}, %arg1: !cir.bool{{.*}}){{.*}} { // CIR: %[[A_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["a", init] // CIR: %[[B_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["b", init] // CIR: %[[X_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["x", init] @@ -419,7 +419,7 @@ void bool_cmp(bool a, bool b) { // CIR: cir.cmp(eq // CIR: cir.cmp(ne -// LLVM-LABEL: define{{.*}} void @_Z8bool_cmpbb(i1 %0, i1 %1) { +// LLVM-LABEL: define{{.*}} void @_Z8bool_cmpbb(i1 %0, i1 %1){{.*}} { // LLVM: %[[A_PTR:.*]] = alloca i8 // LLVM: %[[B_PTR:.*]] = alloca i8 // LLVM: %[[X_PTR:.*]] = alloca i8 diff --git a/clang/test/CIR/CodeGen/comma.c b/clang/test/CIR/CodeGen/comma.c index cc26a3f200664..c0bc4428354b2 100644 --- a/clang/test/CIR/CodeGen/comma.c +++ b/clang/test/CIR/CodeGen/comma.c @@ -16,7 +16,7 @@ void comma(void) { i = 100, 200; } -// CIR-LABEL: cir.func{{.*}} @comma() { +// CIR-LABEL: cir.func{{.*}} @comma() // CIR: %[[B:.*]] = cir.alloca !cir.bool, !cir.ptr, ["b"] // CIR: %[[C:.*]] = cir.alloca !s8i, !cir.ptr, ["c"] // CIR: %[[F:.*]] = cir.alloca !cir.float, !cir.ptr, ["f"] @@ -34,7 +34,7 @@ void comma(void) { // CIR: cir.store{{.*}} %[[HUNDRED]], %[[I]] : !s32i, !cir.ptr // CIR: cir.return -// LLVM-LABEL: define {{.*}}void @comma() { +// LLVM-LABEL: define {{.*}}void @comma(){{.*}} { // LLVM: %[[B_PTR:.*]] = alloca i8 // LLVM: %[[C_PTR:.*]] = alloca i8 // LLVM: %[[F_PTR:.*]] = alloca float diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp index 083d4383e1f7d..4e89af44dcd18 100644 --- a/clang/test/CIR/CodeGen/complex.cpp +++ b/clang/test/CIR/CodeGen/complex.cpp @@ -1360,6 +1360,30 @@ void complex_type_argument() { // OGCG: %[[TMP_ARG:.*]] = load <2 x float>, ptr %[[ARG_ADDR]], align 4 // OGCG: call void @_Z22complex_type_parameterCf(<2 x float> noundef %[[TMP_ARG]]) +float _Complex complex_type_return_type() { + return { 1.0f, 2.0f }; +} + +// CIR: %[[RET_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["__retval"] +// CIR: %[[RET_VAL:.*]] = cir.const #cir.const_complex<#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> : !cir.float> : !cir.complex +// CIR: cir.store{{.*}} %[[RET_VAL]], %[[RET_ADDR]] : !cir.complex, !cir.ptr> +// CIR: %[[TMP_RET:.*]] = cir.load %[[RET_ADDR]] : !cir.ptr>, !cir.complex +// CIR: cir.return %[[TMP_RET]] : !cir.complex + +// TODO(CIR): The difference between the CIR-generated LLVM output and OGCG is due to the lack of calling convention lowering. +// LLVM: %[[RET_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: store { float, float } { float 1.000000e+00, float 2.000000e+00 }, ptr %[[RET_ADDR]], align 4 +// LLVM: %[[TMP_RET:.*]] = load { float, float }, ptr %[[RET_ADDR]], align 4 +// LLVM: ret 
{ float, float } %[[TMP_RET]] + +// OGCG: %[[RET_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[RET_VAL_REAL:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RET_ADDR]], i32 0, i32 0 +// OGCG: %[[RET_VAL_IMAG:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RET_ADDR]], i32 0, i32 1 +// OGCG: store float 1.000000e+00, ptr %[[RET_VAL_REAL]], align 4 +// OGCG: store float 2.000000e+00, ptr %[[RET_VAL_IMAG]], align 4 +// OGCG: %[[TMP_RET:.*]] = load <2 x float>, ptr %[[RET_ADDR]], align 4 +// OGCG: ret <2 x float> %[[TMP_RET]] + void real_on_scalar_bool() { bool a; bool b = __real__ a; @@ -1405,3 +1429,42 @@ void imag_on_scalar_bool() { // OGCG: %[[A_ADDR:.*]] = alloca i8, align 1 // OGCG: %[[B_ADDR:.*]] = alloca i8, align 1 // OGCG: store i8 0, ptr %[[B_ADDR]], align 1 + +void function_with_complex_default_arg( + float _Complex a = __builtin_complex(1.0f, 2.2f)) {} + +// CIR: %[[ARG_0_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a", init] +// CIR: cir.store %{{.*}}, %[[ARG_0_ADDR]] : !cir.complex, !cir.ptr> + +// TODO(CIR): The difference between the CIR-generated LLVM output and OGCG is due to the lack of calling convention lowering. + +// LLVM: %[[ARG_0_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: store { float, float } %{{.*}}, ptr %[[ARG_0_ADDR]], align 4 + +// OGCG: %[[ARG_0_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: store <2 x float> %{{.*}}, ptr %[[ARG_0_ADDR]], align 4 + +void calling_function_with_default_arg() { + function_with_complex_default_arg(); +} + +// CIR: %[[DEFAULT_ARG_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["coerce"] +// CIR: %[[DEFAULT_ARG_VAL:.*]] = cir.const #cir.const_complex<#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.200000e+00> : !cir.float> : !cir.complex +// CIR: cir.store{{.*}} %[[DEFAULT_ARG_VAL]], %[[DEFAULT_ARG_ADDR]] : !cir.complex, !cir.ptr> +// CIR: %[[TMP_DEFAULT_ARG:.*]] = cir.load{{.*}} %[[DEFAULT_ARG_ADDR]] : !cir.ptr>, !cir.complex +// CIR: cir.call @_Z33function_with_complex_default_argCf(%[[TMP_DEFAULT_ARG]]) : (!cir.complex) -> () + +// TODO(CIR): The difference between the CIR-generated LLVM output and OGCG is due to the lack of calling convention lowering. + +// LLVM: %[[DEFAULT_ARG_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: store { float, float } { float 1.000000e+00, float 0x40019999A0000000 }, ptr %[[DEFAULT_ARG_ADDR]], align 4 +// LLVM: %[[TMP_DEFAULT_ARG:.*]] = load { float, float }, ptr %[[DEFAULT_ARG_ADDR]], align 4 +// LLVM: call void @_Z33function_with_complex_default_argCf({ float, float } %[[TMP_DEFAULT_ARG]]) + +// OGCG: %[[DEFAULT_ARG_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[DEFAULT_ARG_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[DEFAULT_ARG_ADDR]], i32 0, i32 0 +// OGCG: %[[DEFAULT_ARG_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[DEFAULT_ARG_ADDR]], i32 0, i32 1 +// OGCG: store float 1.000000e+00, ptr %[[DEFAULT_ARG_REAL_PTR]], align 4 +// OGCG: store float 0x40019999A0000000, ptr %[[DEFAULT_ARG_IMAG_PTR]], align 4 +// OGCG: %[[TMP_DEFAULT_ARG:.*]] = load <2 x float>, ptr %[[DEFAULT_ARG_ADDR]], align 4 +// OGCG: call void @_Z33function_with_complex_default_argCf(<2 x float> {{.*}} %[[TMP_DEFAULT_ARG]]) diff --git a/clang/test/CIR/CodeGen/compound_literal.cpp b/clang/test/CIR/CodeGen/compound_literal.cpp index a92af95c62a1b..30a1dc03c449b 100644 --- a/clang/test/CIR/CodeGen/compound_literal.cpp +++ b/clang/test/CIR/CodeGen/compound_literal.cpp @@ -97,3 +97,30 @@ void foo3() { // OGCG: %[[TMP:.*]] = load <4 x 
i32>, ptr %[[CL_ADDR]], align 16 // OGCG: store <4 x i32> %[[TMP]], ptr %[[A_ADDR]], align 16 +struct Point { + int x, y; +}; + +void foo4() { + Point p = (Point){5, 10}; +} + +// CIR-LABEL: @_Z4foo4v +// CIR: %[[P:.*]] = cir.alloca !rec_Point, !cir.ptr, ["p", init] +// CIR: %[[P_X:.*]] = cir.get_member %[[P]][0] {name = "x"} +// CIR: %[[FIVE:.*]] = cir.const #cir.int<5> : !s32i +// CIR: cir.store{{.*}} %[[FIVE]], %[[P_X]] +// CIR: %[[P_Y:.*]] = cir.get_member %[[P]][1] {name = "y"} +// CIR: %[[TEN:.*]] = cir.const #cir.int<10> : !s32i +// CIR: cir.store{{.*}} %[[TEN]], %[[P_Y]] + +// LLVM-LABEL: @_Z4foo4v +// LLVM: %[[P:.*]] = alloca %struct.Point +// LLVM: %[[P_X:.*]] = getelementptr %struct.Point, ptr %[[P]], i32 0, i32 0 +// LLVM: store i32 5, ptr %[[P_X]] +// LLVM: %[[P_Y:.*]] = getelementptr %struct.Point, ptr %[[P]], i32 0, i32 1 +// LLVM: store i32 10, ptr %[[P_Y]] + +// OGCG-LABEL: @_Z4foo4v +// OGCG: %[[P:.*]] = alloca %struct.Point +// OGCG: call void @llvm.memcpy{{.*}}(ptr{{.*}} %[[P]], ptr{{.*}} @__const._Z4foo4v.p diff --git a/clang/test/CIR/CodeGen/ctor.cpp b/clang/test/CIR/CodeGen/ctor.cpp index 2b06bb0f7cb08..238e4151aeb81 100644 --- a/clang/test/CIR/CodeGen/ctor.cpp +++ b/clang/test/CIR/CodeGen/ctor.cpp @@ -49,7 +49,7 @@ void bar() { // CHECK: cir.func{{.*}} @_ZN13VariadicStrukC1Eiz(%arg0: !cir.ptr // CHECK-SAME: %arg1: !s32i -// CHECK-SAME: ...) { +// CHECK-SAME: ...){{.*}} { // CHECK-NEXT: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] // CHECK-NEXT: %[[N_ADDR:.*]] = cir.alloca {{.*}} ["n", init] // CHECK-NEXT: cir.store %arg0, %[[THIS_ADDR]] diff --git a/clang/test/CIR/CodeGen/dtors.cpp b/clang/test/CIR/CodeGen/dtors.cpp index 7fb09757a27bf..f2c80a547f1d3 100644 --- a/clang/test/CIR/CodeGen/dtors.cpp +++ b/clang/test/CIR/CodeGen/dtors.cpp @@ -14,10 +14,10 @@ void test_temporary_dtor() { } // CIR: cir.func dso_local @_Z19test_temporary_dtorv() -// CIR: %[[ALLOCA:.*]] = cir.alloca !rec_A, !cir.ptr, ["agg.tmp0"] +// CIR: %[[ALLOCA:.*]] = cir.alloca !rec_A, !cir.ptr, ["agg.tmp.ensured"] // CIR: cir.call @_ZN1AD1Ev(%[[ALLOCA]]) nothrow : (!cir.ptr) -> () -// LLVM: define dso_local void @_Z19test_temporary_dtorv() +// LLVM: define dso_local void @_Z19test_temporary_dtorv(){{.*}} // LLVM: %[[ALLOCA:.*]] = alloca %struct.A, i64 1, align 1 // LLVM: call void @_ZN1AD1Ev(ptr %[[ALLOCA]]) @@ -55,7 +55,7 @@ bool test_temp_or() { return make_temp(1) || make_temp(2); } // CIR: cir.yield %[[TERNARY]] : !cir.bool // CIR: } : !cir.bool -// LLVM: define{{.*}} i1 @_Z12test_temp_orv() { +// LLVM: define{{.*}} i1 @_Z12test_temp_orv(){{.*}} { // LLVM: %[[REF_TMP0:.*]] = alloca %struct.B // LLVM: %[[REF_TMP1:.*]] = alloca %struct.B // LLVM: br label %[[LOR_BEGIN:.*]] @@ -125,7 +125,7 @@ bool test_temp_and() { return make_temp(1) && make_temp(2); } // CIR: cir.yield %[[TERNARY]] : !cir.bool // CIR: } : !cir.bool -// LLVM: define{{.*}} i1 @_Z13test_temp_andv() { +// LLVM: define{{.*}} i1 @_Z13test_temp_andv(){{.*}} { // LLVM: %[[REF_TMP0:.*]] = alloca %struct.B // LLVM: %[[REF_TMP1:.*]] = alloca %struct.B // LLVM: br label %[[LAND_BEGIN:.*]] @@ -199,7 +199,7 @@ void test_nested_dtor() { // CIR: cir.func{{.*}} @_Z16test_nested_dtorv() // CIR: cir.call @_ZN1DD2Ev(%{{.*}}) -// LLVM: define {{.*}} void @_Z16test_nested_dtorv() +// LLVM: define {{.*}} void @_Z16test_nested_dtorv(){{.*}} // LLVM: call void @_ZN1DD2Ev(ptr %{{.*}}) // OGCG: define {{.*}} void @_Z16test_nested_dtorv() @@ -236,7 +236,7 @@ void test_base_dtor_call() { // CIR: cir.func {{.*}} @_Z19test_base_dtor_callv() // cir.call 
@_ZN1FD2Ev(%{{.*}}) nothrow : (!cir.ptr) -> () -// LLVM: define {{.*}} void @_Z19test_base_dtor_callv() +// LLVM: define {{.*}} void @_Z19test_base_dtor_callv(){{.*}} // LLVM: call void @_ZN1FD2Ev(ptr %{{.*}}) // OGCG: define {{.*}} void @_Z19test_base_dtor_callv() diff --git a/clang/test/CIR/CodeGen/dynamic-cast.cpp b/clang/test/CIR/CodeGen/dynamic-cast.cpp index b4938402f0256..5d010d20bb9f1 100644 --- a/clang/test/CIR/CodeGen/dynamic-cast.cpp +++ b/clang/test/CIR/CodeGen/dynamic-cast.cpp @@ -101,3 +101,59 @@ Derived &ref_cast(Base &b) { // OGCG: br i1 %[[IS_NULL]], label %[[BAD_CAST:.*]], label %[[DONE:.*]] // OGCG: [[BAD_CAST]]: // OGCG: call void @__cxa_bad_cast() + +void *ptr_cast_to_complete(Base *ptr) { + return dynamic_cast<void *>(ptr); +} + +// CIR-BEFORE: cir.func dso_local @_Z20ptr_cast_to_completeP4Base +// CIR-BEFORE: %{{.+}} = cir.dyn_cast ptr %{{.+}} : !cir.ptr -> !cir.ptr +// CIR-BEFORE: } + +// CIR-AFTER: cir.func dso_local @_Z20ptr_cast_to_completeP4Base +// CIR-AFTER: %[[SRC:.*]] = cir.load{{.*}} %{{.+}} : !cir.ptr>, !cir.ptr +// CIR-AFTER-NEXT: %[[SRC_IS_NOT_NULL:.*]] = cir.cast ptr_to_bool %[[SRC]] : !cir.ptr -> !cir.bool +// CIR-AFTER-NEXT: %{{.+}} = cir.ternary(%[[SRC_IS_NOT_NULL]], true { +// CIR-AFTER-NEXT: %[[VPTR_PTR:.*]] = cir.vtable.get_vptr %[[SRC]] : !cir.ptr -> !cir.ptr +// CIR-AFTER-NEXT: %[[VPTR:.*]] = cir.load %[[VPTR_PTR]] : !cir.ptr, !cir.vptr +// CIR-AFTER-NEXT: %[[ELEM_PTR:.*]] = cir.cast bitcast %[[VPTR]] : !cir.vptr -> !cir.ptr +// CIR-AFTER-NEXT: %[[MINUS_TWO:.*]] = cir.const #cir.int<-2> : !s64i +// CIR-AFTER-NEXT: %[[BASE_OFFSET_PTR:.*]] = cir.ptr_stride %[[ELEM_PTR]], %[[MINUS_TWO]] : (!cir.ptr, !s64i) -> !cir.ptr +// CIR-AFTER-NEXT: %[[BASE_OFFSET:.*]] = cir.load{{.*}} %[[BASE_OFFSET_PTR]] : !cir.ptr, !s64i +// CIR-AFTER-NEXT: %[[SRC_BYTES_PTR:.*]] = cir.cast bitcast %[[SRC]] : !cir.ptr -> !cir.ptr +// CIR-AFTER-NEXT: %[[DST_BYTES_PTR:.*]] = cir.ptr_stride %[[SRC_BYTES_PTR]], %[[BASE_OFFSET]] : (!cir.ptr, !s64i) -> !cir.ptr +// CIR-AFTER-NEXT: %[[CASTED_PTR:.*]] = cir.cast bitcast %[[DST_BYTES_PTR]] : !cir.ptr -> !cir.ptr +// CIR-AFTER-NEXT: cir.yield %[[CASTED_PTR]] : !cir.ptr +// CIR-AFTER-NEXT: }, false { +// CIR-AFTER-NEXT: %[[NULL_PTR:.*]] = cir.const #cir.ptr : !cir.ptr +// CIR-AFTER-NEXT: cir.yield %[[NULL_PTR]] : !cir.ptr +// CIR-AFTER-NEXT: }) : (!cir.bool) -> !cir.ptr +// CIR-AFTER: } + +// LLVM: define {{.*}} @_Z20ptr_cast_to_completeP4Base +// LLVM: %[[IS_NOT_NULL:.*]] = icmp ne ptr %[[PTR:.*]], null +// LLVM: br i1 %[[IS_NOT_NULL]], label %[[NOT_NULL:.*]], label %[[NULL:.*]] +// LLVM: [[NOT_NULL]]: +// LLVM: %[[VPTR:.*]] = load ptr, ptr %[[PTR]] +// LLVM: %[[BASE_OFFSET_PTR:.*]] = getelementptr i64, ptr %7, i64 -2 +// LLVM: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_PTR]] +// LLVM: %[[RESULT:.*]] = getelementptr i8, ptr %[[PTR]], i64 %[[BASE_OFFSET]] +// LLVM: br label %[[DONE:.*]] +// LLVM: [[NULL]]: +// LLVM: br label %[[DONE]] +// LLVM: [[DONE]]: +// LLVM: %[[RET:.*]] = phi ptr [ null, %[[NULL]] ], [ %[[RESULT]], %[[NOT_NULL]] ] + +// OGCG: define {{.*}} @_Z20ptr_cast_to_completeP4Base +// OGCG: %[[IS_NULL:.*]] = icmp eq ptr %[[PTR:.*]], null +// OGCG: br i1 %[[IS_NULL]], label %[[NULL:.*]], label %[[NOT_NULL:.*]] +// OGCG: [[NOT_NULL]]: +// OGCG: %[[VPTR:.*]] = load ptr, ptr %[[PTR]] +// OGCG: %[[BASE_OFFSET_PTR:.*]] = getelementptr inbounds i64, ptr %[[VPTR]], i64 -2 +// OGCG: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_PTR]] +// OGCG: %[[RESULT:.*]] = getelementptr inbounds i8, ptr %[[PTR]], i64 %[[BASE_OFFSET]] +// 
OGCG: br label %[[DONE:.*]] +// OGCG: [[NULL]]: +// OGCG: br label %[[DONE]] +// OGCG: [[DONE]]: +// OGCG: %[[RET:.*]] = phi ptr [ %[[RESULT]], %[[NOT_NULL]] ], [ null, %[[NULL]] ] diff --git a/clang/test/CIR/CodeGen/goto.cpp b/clang/test/CIR/CodeGen/goto.cpp index 48cb44ed0f478..257c2550c2399 100644 --- a/clang/test/CIR/CodeGen/goto.cpp +++ b/clang/test/CIR/CodeGen/goto.cpp @@ -205,6 +205,8 @@ extern "C" void case_follow_label(int v) { // CIR: cir.func dso_local @case_follow_label // CIR: cir.switch // CIR: cir.case(equal, [#cir.int<1> : !s32i]) { +// CIR: cir.br ^bb1 +// CIR: ^bb1: // CIR: cir.label "label" // CIR: cir.case(equal, [#cir.int<2> : !s32i]) { // CIR: cir.call @action1() @@ -215,9 +217,11 @@ extern "C" void case_follow_label(int v) { // LLVM: define dso_local void @case_follow_label // LLVM: switch i32 {{.*}}, label %[[SWDEFAULT:.*]] [ -// LLVM: i32 1, label %[[LABEL:.*]] +// LLVM: i32 1, label %[[CASE1:.*]] // LLVM: i32 2, label %[[CASE2:.*]] // LLVM: ] +// LLVM: [[CASE1]]: +// LLVM: br label %[[LABEL:.*]] // LLVM: [[LABEL]]: // LLVM: br label %[[CASE2]] // LLVM: [[CASE2]]: @@ -303,3 +307,24 @@ extern "C" void default_follow_label(int v) { // OGCG: br label %label // OGCG: sw.epilog: // OGCG: ret void + +void g3() { +label: + goto label; +} + +// CIR: cir.func dso_local @_Z2g3v +// CIR: cir.br ^bb1 +// CIR: ^bb1: +// CIR: cir.label "label" +// CIR: cir.goto "label" + +// LLVM: define dso_local void @_Z2g3v() +// LLVM: br label %1 +// LLVM: 1: +// LLVM: br label %1 + +// OGCG: define dso_local void @_Z2g3v() +// OGCG: br label %label +// OGCG: label: +// OGCG: br label %label diff --git a/clang/test/CIR/CodeGen/inline-attributes.cpp b/clang/test/CIR/CodeGen/inline-attributes.cpp new file mode 100644 index 0000000000000..fab4010354daf --- /dev/null +++ b/clang/test/CIR/CodeGen/inline-attributes.cpp @@ -0,0 +1,75 @@ +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -O1 -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -O1 -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -O1 -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s + +extern int global_var; + +__attribute__((always_inline)) inline int always_inline_function(int x) { + return x * 2 + global_var; +} + +inline int inline_hint_function(int x) { + return x - 1 + global_var; +} + +__attribute__((noinline)) int noinline_function(int x) { + return x / 2 + global_var; +} + +int regular_function(int x) { + return x + 1 + global_var; +} + +// Force emission of all functions with function pointers +int (*always_inline_ptr)(int) = &always_inline_function; +int (*inline_hint_ptr)(int) = &inline_hint_function; +int (*noinline_ptr)(int) = &noinline_function; +int (*regular_ptr)(int) = &regular_function; + +// CIR-LABEL: cir.func dso_local @_Z17noinline_functioni(%arg0: !s32i {{.*}}) -> !s32i inline(never) + +// CIR-LABEL: cir.func dso_local @_Z16regular_functioni(%arg0: !s32i {{.*}}) -> !s32i +// CIR-NOT: inline(never) +// CIR-NOT: inline(always) +// CIR-NOT: inline(hint) +// CIR-SAME: { + +// CIR-LABEL: cir.func {{.*}}@_Z22always_inline_functioni(%arg0: !s32i {{.*}}) -> !s32i inline(always) + +// CIR-LABEL: cir.func {{.*}}@_Z20inline_hint_functioni(%arg0: !s32i {{.*}}) -> !s32i inline(hint) + +// LLVM: ; Function Attrs:{{.*}} noinline +// LLVM: define{{.*}} i32 
@_Z17noinline_functioni + +// LLVM: ; Function Attrs: +// LLVM-NOT: noinline +// LLVM-NOT: alwaysinline +// LLVM-NOT: inlinehint +// LLVM-SAME: {{$}} +// LLVM: define{{.*}} i32 @_Z16regular_functioni + +// LLVM: ; Function Attrs:{{.*}} alwaysinline +// LLVM: define{{.*}} i32 @_Z22always_inline_functioni + +// LLVM: ; Function Attrs:{{.*}} inlinehint +// LLVM: define{{.*}} i32 @_Z20inline_hint_functioni + +// OGCG: ; Function Attrs:{{.*}} noinline +// OGCG: define{{.*}} i32 @_Z17noinline_functioni + +// OGCG: ; Function Attrs: +// OGCG-NOT: noinline +// OGCG-NOT: alwaysinline +// OGCG-NOT: inlinehint +// OGCG-SAME: {{$}} +// OGCG: define{{.*}} i32 @_Z16regular_functioni + +// OGCG: ; Function Attrs:{{.*}} alwaysinline +// OGCG: define{{.*}} i32 @_Z22always_inline_functioni + +// OGCG: ; Function Attrs:{{.*}} inlinehint +// OGCG: define{{.*}} i32 @_Z20inline_hint_functioni + diff --git a/clang/test/CIR/CodeGen/label.c b/clang/test/CIR/CodeGen/label.c index a050094de678b..fd3c7f233fc8b 100644 --- a/clang/test/CIR/CodeGen/label.c +++ b/clang/test/CIR/CodeGen/label.c @@ -11,10 +11,14 @@ void label() { } // CIR: cir.func no_proto dso_local @label +// CIR: cir.br ^bb1 +// CIR: ^bb1: // CIR: cir.label "labelA" // CIR: cir.return // LLVM:define dso_local void @label +// LLVM: br label %1 +// LLVM: 1: // LLVM: ret void // OGCG: define dso_local void @label @@ -29,15 +33,19 @@ void multiple_labels() { } // CIR: cir.func no_proto dso_local @multiple_labels -// CIR: cir.label "labelB" // CIR: cir.br ^bb1 -// CIR: ^bb1: // pred: ^bb0 +// CIR: ^bb1: +// CIR: cir.label "labelB" +// CIR: cir.br ^bb2 +// CIR: ^bb2: // CIR: cir.label "labelC" // CIR: cir.return -// LLVM: define dso_local void @multiple_labels() +// LLVM: define dso_local void @multiple_labels(){{.*}} // LLVM: br label %1 // LLVM: 1: +// LLVM: br label %2 +// LLVM: 2: // LLVM: ret void // OGCG: define dso_local void @multiple_labels @@ -56,6 +64,8 @@ void label_in_if(int cond) { // CIR: cir.func dso_local @label_in_if // CIR: cir.if {{.*}} { +// CIR: cir.br ^bb1 +// CIR: ^bb1: // CIR: cir.label "labelD" // CIR: [[LOAD:%.*]] = cir.load align(4) [[COND:%.*]] : !cir.ptr, !s32i // CIR: [[INC:%.*]] = cir.unary(inc, %3) nsw : !s32i, !s32i @@ -63,20 +73,22 @@ void label_in_if(int cond) { // CIR: } // CIR: cir.return -// LLVM: define dso_local void @label_in_if +// LLVM: define dso_local void @label_in_if{{.*}} // LLVM: br label %3 // LLVM: 3: // LLVM: [[LOAD:%.*]] = load i32, ptr [[COND:%.*]], align 4 // LLVM: [[CMP:%.*]] = icmp ne i32 [[LOAD]], 0 -// LLVM: br i1 [[CMP]], label %6, label %9 +// LLVM: br i1 [[CMP]], label %6, label %10 // LLVM: 6: +// LLVM: br label %7 +// LLVM: 7: // LLVM: [[LOAD2:%.*]] = load i32, ptr [[COND]], align 4 // LLVM: [[ADD1:%.*]] = add nsw i32 [[LOAD2]], 1 // LLVM: store i32 [[ADD1]], ptr [[COND]], align 4 -// LLVM: br label %9 -// LLVM: 9: // LLVM: br label %10 // LLVM: 10: +// LLVM: br label %11 +// LLVM: 11: // LLVM: ret void // OGCG: define dso_local void @label_in_if @@ -103,7 +115,7 @@ void after_return() { // CIR: cir.label "label" // CIR: cir.br ^bb1 -// LLVM: define dso_local void @after_return +// LLVM: define dso_local void @after_return{{.*}} // LLVM: br label %1 // LLVM: 1: // LLVM: ret void @@ -127,7 +139,7 @@ void after_unreachable() { // CIR: cir.label "label" // CIR: cir.return -// LLVM: define dso_local void @after_unreachable +// LLVM: define dso_local void @after_unreachable{{.*}} // LLVM: unreachable // LLVM: 1: // LLVM: ret void @@ -142,11 +154,15 @@ void labelWithoutMatch() { return; } // CIR: 
cir.func no_proto dso_local @labelWithoutMatch +// CIR: cir.br ^bb1 +// CIR: ^bb1: // CIR: cir.label "end" // CIR: cir.return // CIR: } // LLVM: define dso_local void @labelWithoutMatch +// LLVM: br label %1 +// LLVM: 1: // LLVM: ret void // OGCG: define dso_local void @labelWithoutMatch @@ -167,13 +183,17 @@ void foo() { // CIR: cir.func no_proto dso_local @foo // CIR: cir.scope { -// CIR: cir.label "label" // CIR: %0 = cir.alloca !rec_S, !cir.ptr, ["agg.tmp0"] +// CIR: cir.br ^bb1 +// CIR: ^bb1: +// CIR: cir.label "label" -// LLVM:define dso_local void @foo() { +// LLVM: define dso_local void @foo(){{.*}} { // LLVM: [[ALLOC:%.*]] = alloca %struct.S, i64 1, align 1 // LLVM: br label %2 // LLVM:2: +// LLVM: br label %3 +// LLVM:3: // LLVM: [[CALL:%.*]] = call %struct.S @get() // LLVM: store %struct.S [[CALL]], ptr [[ALLOC]], align 1 // LLVM: [[LOAD:%.*]] = load %struct.S, ptr [[ALLOC]], align 1 diff --git a/clang/test/CIR/CodeGen/lambda-static-invoker.cpp b/clang/test/CIR/CodeGen/lambda-static-invoker.cpp index 15d768ef21b03..e7d199b976865 100644 --- a/clang/test/CIR/CodeGen/lambda-static-invoker.cpp +++ b/clang/test/CIR/CodeGen/lambda-static-invoker.cpp @@ -50,7 +50,7 @@ int g3() { // CIR: %[[RET:.*]] = cir.load %[[RETVAL]] // CIR: cir.return %[[RET]] -// LLVM: define internal i32 @"_ZZ2g3vENK3$_0clERKi"(ptr %[[THIS_ARG:.*]], ptr %[[REF_I_ARG:.*]]) { +// LLVM: define internal i32 @"_ZZ2g3vENK3$_0clERKi"(ptr %[[THIS_ARG:.*]], ptr %[[REF_I_ARG:.*]]){{.*}} { // LLVM: %[[THIS_ALLOCA:.*]] = alloca ptr // LLVM: %[[REF_I_ALLOCA:.*]] = alloca ptr // LLVM: %[[RETVAL:.*]] = alloca i32 @@ -66,7 +66,7 @@ int g3() { // In OGCG, the _ZZ2g3vENK3$_0clERKi function is emitted after _ZZ2g3vEN3$_08__invokeERKi, see below. // lambda invoker -// CIR: cir.func internal private dso_local @_ZZ2g3vEN3$_08__invokeERKi(%[[REF_I_ARG:.*]]: !cir.ptr {{.*}}) -> !s32i { +// CIR: cir.func internal private dso_local @_ZZ2g3vEN3$_08__invokeERKi(%[[REF_I_ARG:.*]]: !cir.ptr {{.*}}) -> !s32i{{.*}} { // CIR: %[[REF_I_ALLOCA:.*]] = cir.alloca {{.*}} ["i", init, const] // CIR: %[[RETVAL:.*]] = cir.alloca {{.*}} ["__retval"] // CIR: %[[LAM_ALLOCA:.*]] = cir.alloca ![[REC_LAM_G3]], !cir.ptr, ["unused.capture"] @@ -77,7 +77,7 @@ int g3() { // CIR: %[[RET:.*]] = cir.load %[[RETVAL]] // CIR: cir.return %[[RET]] -// LLVM: define internal i32 @"_ZZ2g3vEN3$_08__invokeERKi"(ptr %[[REF_I_ARG:.*]]) { +// LLVM: define internal i32 @"_ZZ2g3vEN3$_08__invokeERKi"(ptr %[[REF_I_ARG:.*]]){{.*}} { // LLVM: %[[REF_I_ALLOCA:.*]] = alloca ptr // LLVM: %[[RETVAL:.*]] = alloca i32 // LLVM: %[[LAM_ALLOCA:.*]] = alloca %[[REC_LAM_G3:.*]], @@ -91,7 +91,7 @@ int g3() { // In OGCG, the _ZZ2g3vEN3$_08__invokeERKi function is emitted after _ZN1A3barEv, see below. 
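// A rough mental model for the three pieces checked in this file: a
// captureless lambda is lowered to an empty record, the compiler synthesizes
// a static "invoker" that forwards to operator(), and the conversion
// operator simply returns the invoker's address. Hand-written sketch
// (illustrative only; the names below are hypothetical stand-ins for the
// mangled $_0 symbols checked in this file):
//
//   struct Lambda {                               // stands in for $_0
//     int operator()(const int &i) const;         // call operator (...clERKi)
//     static int __invoke(const int &i) {         // static invoker (...invokeERKi)
//       return Lambda{}(i);                       // forwards to operator()
//     }
//     using fn_t = int (*)(const int &);
//     operator fn_t() const { return &__invoke; } // conversion op (...cvPFiRKiEEv)
//   };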
// lambda operator int (*)(int const&)() -// CIR: cir.func internal private dso_local @_ZZ2g3vENK3$_0cvPFiRKiEEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) -> !cir.ptr) -> !s32i>> { +// CIR: cir.func internal private dso_local @_ZZ2g3vENK3$_0cvPFiRKiEEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) -> !cir.ptr) -> !s32i>>{{.*}} { // CIR: %[[THIS_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] // CIR: %[[RETVAL:.*]] = cir.alloca !cir.ptr) -> !s32i>>, !cir.ptr) -> !s32i>>>, ["__retval"] // CIR: cir.store %[[THIS_ARG]], %[[THIS_ALLOCA]] @@ -101,7 +101,7 @@ int g3() { // CIR: %[[RET:.*]] = cir.load %[[RETVAL]] // CIR: cir.return %[[RET]] -// LLVM: define internal ptr @"_ZZ2g3vENK3$_0cvPFiRKiEEv"(ptr %[[THIS_ARG:.*]]) { +// LLVM: define internal ptr @"_ZZ2g3vENK3$_0cvPFiRKiEEv"(ptr %[[THIS_ARG:.*]]){{.*}} { // LLVM: %[[THIS_ALLOCA:.*]] = alloca ptr // LLVM: %[[RETVAL:.*]] = alloca ptr // LLVM: store ptr %[[THIS_ARG]], ptr %[[THIS_ALLOCA]] @@ -112,7 +112,7 @@ int g3() { // In OGCG, the _ZZ2g3vENK3$_0cvPFiRKiEEv function is emitted just after the _Z2g3v function, see above. -// CIR: cir.func{{.*}} @_Z2g3v() -> !s32i { +// CIR: cir.func{{.*}} @_Z2g3v() -> !s32i{{.*}} { // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: %[[FN_ADDR:.*]] = cir.alloca !cir.ptr) -> !s32i>>, !cir.ptr) -> !s32i>>>, ["fn", init] // CIR: %[[TASK:.*]] = cir.alloca !s32i, !cir.ptr, ["task", init] @@ -145,7 +145,7 @@ int g3() { // CIR: cir.return %[[RET]] // CIR: } -// LLVM: define dso_local i32 @_Z2g3v() { +// LLVM: define dso_local i32 @_Z2g3v(){{.*}} { // LLVM: %[[LAM_ALLOCA:.*]] = alloca %[[REC_LAM_G3]] // LLVM: %[[REF_TMP1:.*]] = alloca i32 // LLVM: %[[RETVAL:.*]] = alloca i32 diff --git a/clang/test/CIR/CodeGen/lambda.cpp b/clang/test/CIR/CodeGen/lambda.cpp index 033adc60be1ed..0c32ceb187df4 100644 --- a/clang/test/CIR/CodeGen/lambda.cpp +++ b/clang/test/CIR/CodeGen/lambda.cpp @@ -13,13 +13,13 @@ void fn() { a(); } -// CIR: cir.func lambda internal private dso_local @_ZZ2fnvENK3$_0clEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) +// CIR: cir.func lambda internal private dso_local @_ZZ2fnvENK3$_0clEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) {{.*}} { // CIR: %[[THIS:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] // CIR: cir.store %[[THIS_ARG]], %[[THIS]] // CIR: cir.load %[[THIS]] // CIR: cir.return -// CIR: cir.func dso_local @_Z2fnv() +// CIR: cir.func dso_local @_Z2fnv() {{.*}} { // CIR: %[[A:.*]] = cir.alloca ![[REC_LAM_FN_A]], !cir.ptr, ["a"] // CIR: cir.call @_ZZ2fnvENK3$_0clEv(%[[A]]) @@ -52,7 +52,7 @@ void l0() { a(); } -// CIR: cir.func lambda internal private dso_local @_ZZ2l0vENK3$_0clEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) +// CIR: cir.func lambda internal private dso_local @_ZZ2l0vENK3$_0clEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) {{.*}} { // CIR: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] {alignment = 8 : i64} // CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] // CIR: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] @@ -66,7 +66,7 @@ void l0() { // CIR: cir.store{{.*}} %[[I_PLUS_ONE]], %[[I_ADDR]] // CIR: cir.return -// CIR: cir.func {{.*}} @_Z2l0v() +// CIR: cir.func {{.*}} @_Z2l0v() {{.*}} { // CIR: %[[I:.*]] = cir.alloca !s32i, !cir.ptr, ["i"] // CIR: %[[A:.*]] = cir.alloca ![[REC_LAM_L0_A]], !cir.ptr, ["a", init] // CIR: %[[I_ADDR:.*]] = cir.get_member %[[A]][0] {name = "i"} @@ -124,7 +124,7 @@ auto g() { }; } -// CIR: cir.func dso_local @_Z1gv() -> ![[REC_LAM_G:.*]] { +// CIR: cir.func dso_local @_Z1gv() -> ![[REC_LAM_G:.*]] {{.*}} { // CIR: %[[RETVAL:.*]] = cir.alloca ![[REC_LAM_G]], 
!cir.ptr, ["__retval"] // CIR: %[[I_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init] // CIR: %[[TWELVE:.*]] = cir.const #cir.int<12> : !s32i @@ -166,7 +166,7 @@ auto g2() { } // Should be same as above because of NRVO -// CIR: cir.func dso_local @_Z2g2v() -> ![[REC_LAM_G2:.*]] { +// CIR: cir.func dso_local @_Z2g2v() -> ![[REC_LAM_G2:.*]] {{.*}} { // CIR: %[[RETVAL:.*]] = cir.alloca ![[REC_LAM_G2]], !cir.ptr, ["__retval", init] // CIR: %[[I_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["i", init] // CIR: %[[TWELVE:.*]] = cir.const #cir.int<12> : !s32i @@ -199,7 +199,7 @@ int f() { return g2()(); } -// CIR:cir.func lambda internal private dso_local @_ZZ2g2vENK3$_0clEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) -> !s32i +// CIR:cir.func lambda internal private dso_local @_ZZ2g2vENK3$_0clEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) -> !s32i {{.*}} { // CIR: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] @@ -217,7 +217,7 @@ int f() { // CIR: %[[RET:.*]] = cir.load %[[RETVAL]] // CIR: cir.return %[[RET]] -// CIR: cir.func dso_local @_Z1fv() -> !s32i +// CIR: cir.func dso_local @_Z1fv() -> !s32i {{.*}} { // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: %[[SCOPE_RET:.*]] = cir.scope { // CIR: %[[TMP:.*]] = cir.alloca ![[REC_LAM_G2]], !cir.ptr, ["ref.tmp0"] @@ -301,7 +301,7 @@ struct A { // OGCG: call noundef i32 @_ZN1A3barEv(ptr {{.*}} %[[A_THIS]]) // lambda operator() in foo() -// CIR: cir.func lambda comdat linkonce_odr @_ZZN1A3fooEvENKUlvE_clEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) +// CIR: cir.func lambda comdat linkonce_odr @_ZZN1A3fooEvENKUlvE_clEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) {{.*}} { // CIR: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: cir.store{{.*}} %[[THIS_ARG]], %[[THIS_ADDR]] @@ -328,7 +328,7 @@ struct A { // The function above is defined after _ZN1A3barEv in OGCG, see below. // A::foo() -// CIR: cir.func {{.*}} @_ZN1A3fooEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) -> !s32i +// CIR: cir.func {{.*}} @_ZN1A3fooEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) -> !s32i {{.*}} { // CIR: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] @@ -373,7 +373,7 @@ struct A { // OGCG: ret i32 %[[LAM_RET]] // lambda operator() in bar() -// CIR: cir.func {{.*}} @_ZZN1A3barEvENKUlvE_clEv(%[[THIS_ARG2:.*]]: !cir.ptr {{.*}}) -> !s32i +// CIR: cir.func {{.*}} @_ZZN1A3barEvENKUlvE_clEv(%[[THIS_ARG2:.*]]: !cir.ptr {{.*}}) -> !s32i {{.*}} { // CIR: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: cir.store{{.*}} %[[THIS_ARG]], %[[THIS_ADDR]] @@ -402,7 +402,7 @@ struct A { // The function above is defined after _ZZN1A3fooEvENKUlvE_clEv in OGCG, see below. 
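// A quick reference for the capture records checked throughout this file: a
// capturing lambda is lowered to a record whose fields are its captures, and
// operator() receives a pointer to that record as its `this`. Minimal sketch
// under assumed names (the real types are the anonymous ![[REC_LAM_*]]
// records captured above; the field name is hypothetical):
//
//   struct LambdaRec { int i; };               // one by-value capture slot
//   int lambda_call(const LambdaRec *self) {   // stands in for the ...clEv symbols
//     return self->i;                          // reads the capture back out
//   }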
// A::bar() -// CIR: cir.func {{.*}} @_ZN1A3barEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) -> !s32i +// CIR: cir.func {{.*}} @_ZN1A3barEv(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}) -> !s32i {{.*}} { // CIR: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] @@ -472,7 +472,7 @@ int test_lambda_this1(){ return x+y; } -// CIR: cir.func {{.*}} @_Z17test_lambda_this1v +// CIR: cir.func {{.*}} @_Z17test_lambda_this1v{{.*}} { // CIR: cir.call @_ZN1AC1Ev(%[[A_THIS:.*]]){{.*}} : (!cir.ptr) -> () // CIR: cir.call @_ZN1A3fooEv(%[[A_THIS]]){{.*}} : (!cir.ptr) -> !s32i // CIR: cir.call @_ZN1A3barEv(%[[A_THIS]]){{.*}} : (!cir.ptr) -> !s32i diff --git a/clang/test/CIR/CodeGen/linkage-spec.cpp b/clang/test/CIR/CodeGen/linkage-spec.cpp index eb6c7b0a546a9..1affecd28d488 100644 --- a/clang/test/CIR/CodeGen/linkage-spec.cpp +++ b/clang/test/CIR/CodeGen/linkage-spec.cpp @@ -1,42 +1,42 @@ // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o - 2>&1 | FileCheck %s extern "C" void TopLevelC(){} -// CHECK: cir.func{{.*}} @TopLevelC() { +// CHECK: cir.func dso_local @TopLevelC() inline(never) { extern "C++" void TopLevelCpp(){} -// CHECK: cir.func{{.*}} @_Z11TopLevelCppv() { +// CHECK: cir.func dso_local @_Z11TopLevelCppv() inline(never) { extern "C++" { void ExternCppEmpty(){} - // CHECK: cir.func{{.*}} @_Z14ExternCppEmptyv() { + // CHECK: cir.func dso_local @_Z14ExternCppEmptyv() inline(never) { extern "C" void ExternCpp_C(){} - // CHECK: cir.func{{.*}} @ExternCpp_C() { + // CHECK: cir.func dso_local @ExternCpp_C() inline(never) { extern "C++" void ExternCpp_Cpp(){} - // CHECK: cir.func{{.*}} @_Z13ExternCpp_Cppv() { + // CHECK: cir.func dso_local @_Z13ExternCpp_Cppv() inline(never) { extern "C" { void ExternCpp_CEmpty(){} - // CHECK: cir.func{{.*}} @ExternCpp_CEmpty() { + // CHECK: cir.func dso_local @ExternCpp_CEmpty() inline(never) { extern "C" void ExternCpp_C_C(){} - // CHECK: cir.func{{.*}} @ExternCpp_C_C() { + // CHECK: cir.func dso_local @ExternCpp_C_C() inline(never) { extern "C++" void ExternCpp_C_Cpp(){} - // CHECK: cir.func{{.*}} @_Z15ExternCpp_C_Cppv() { + // CHECK: cir.func dso_local @_Z15ExternCpp_C_Cppv() inline(never) { } } extern "C" { void ExternCEmpty(){} - // CHECK: cir.func{{.*}} @ExternCEmpty() { + // CHECK: cir.func dso_local @ExternCEmpty() inline(never) { extern "C" void ExternC_C(){} - // CHECK: cir.func{{.*}} @ExternC_C() { + // CHECK: cir.func dso_local @ExternC_C() inline(never) { extern "C++" void ExternC_Cpp(){} - // CHECK: cir.func{{.*}} @_Z11ExternC_Cppv() { + // CHECK: cir.func dso_local @_Z11ExternC_Cppv() inline(never) { extern "C++" { void ExternC_CppEmpty(){} - // CHECK: cir.func{{.*}} @_Z16ExternC_CppEmptyv() { + // CHECK: cir.func dso_local @_Z16ExternC_CppEmptyv() inline(never) { extern "C" void ExternC_Cpp_C(){} - // CHECK: cir.func{{.*}} @ExternC_Cpp_C() { + // CHECK: cir.func dso_local @ExternC_Cpp_C() inline(never) { extern "C++" void ExternC_Cpp_Cpp(){} - // CHECK: cir.func{{.*}} @_Z15ExternC_Cpp_Cppv() { + // CHECK: cir.func dso_local @_Z15ExternC_Cpp_Cppv() inline(never) { } } diff --git a/clang/test/CIR/CodeGen/loop.cpp b/clang/test/CIR/CodeGen/loop.cpp index b932f9d4b8a9c..3d286664bba85 100644 --- a/clang/test/CIR/CodeGen/loop.cpp +++ b/clang/test/CIR/CodeGen/loop.cpp @@ -24,7 +24,7 @@ void l0() { // CIR: cir.return // CIR: } -// LLVM: define{{.*}} void @_Z2l0v() +// LLVM: define{{.*}} void @_Z2l0v(){{.*}} // LLVM: 
br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: // LLVM: br label %[[LABEL2:.*]] @@ -67,7 +67,7 @@ void l1() { // CIR-NEXT: cir.return // CIR-NEXT: } -// LLVM: define{{.*}} void @_Z2l1v() +// LLVM: define{{.*}} void @_Z2l1v(){{.*}} // LLVM: %[[I:.*]] = alloca i32, i64 1, align 4 // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: @@ -117,7 +117,7 @@ void l2() { // CIR-NEXT: cir.return // CIR-NEXT: } -// LLVM: define{{.*}} void @_Z2l2v() +// LLVM: define{{.*}} void @_Z2l2v(){{.*}} // LLVM: %[[I:.*]] = alloca i32, i64 1, align 4 // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: @@ -165,7 +165,7 @@ void l3() { // CIR-NEXT: cir.return // CIR-NEXT: } -// LLVM: define{{.*}} void @_Z2l3v() +// LLVM: define{{.*}} void @_Z2l3v(){{.*}} // LLVM: %[[I:.*]] = alloca i32, i64 1, align 4 // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: @@ -231,7 +231,7 @@ void l4() { // CIR: } // CIR: } -// LLVM: define{{.*}} void @_Z2l4v() { +// LLVM: define{{.*}} void @_Z2l4v(){{.*}} { // LLVM: %[[RANGE_ADDR:.*]] = alloca ptr // LLVM: %[[BEGIN_ADDR:.*]] = alloca ptr // LLVM: %[[END_ADDR:.*]] = alloca ptr @@ -355,7 +355,7 @@ void l5() { // CIR: } // CIR: } -// LLVM: define{{.*}} void @_Z2l5v() { +// LLVM: define{{.*}} void @_Z2l5v(){{.*}} { // LLVM: %[[ARR_ADDR:.*]] = alloca [4 x i32] // LLVM: %[[RANGE_ADDR:.*]] = alloca ptr // LLVM: %[[BEGIN_ADDR:.*]] = alloca ptr @@ -448,7 +448,7 @@ void test_do_while_false() { // CIR-NEXT: %[[FALSE:.*]] = cir.cast int_to_bool %[[ZERO]] : !s32i -> !cir.bool // CIR-NEXT: cir.condition(%[[FALSE]]) -// LLVM: define{{.*}} void @_Z19test_do_while_falsev() +// LLVM: define{{.*}} void @_Z19test_do_while_falsev(){{.*}} // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: // LLVM: br label %[[LABEL3:.*]] @@ -486,7 +486,7 @@ void test_empty_while_true() { // CIR-NEXT: } // CIR-NEXT: cir.yield -// LLVM: define{{.*}} void @_Z21test_empty_while_truev() +// LLVM: define{{.*}} void @_Z21test_empty_while_truev(){{.*}} // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: // LLVM: br label %[[LABEL2:.*]] @@ -539,7 +539,7 @@ void unreachable_after_continue() { // CIR: cir.return // CIR: } -// LLVM: define{{.*}} void @_Z26unreachable_after_continuev() +// LLVM: define{{.*}} void @_Z26unreachable_after_continuev(){{.*}} // LLVM: %[[X:.*]] = alloca i32, i64 1, align 4 // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: @@ -599,7 +599,7 @@ void unreachable_after_break() { // CIR: cir.return // CIR: } -// LLVM: define{{.*}} void @_Z23unreachable_after_breakv() +// LLVM: define{{.*}} void @_Z23unreachable_after_breakv(){{.*}} // LLVM: %[[X:.*]] = alloca i32, i64 1, align 4 // LLVM: br label %[[LABEL1:.*]] // LLVM: [[LABEL1]]: diff --git a/clang/test/CIR/CodeGen/member-functions.cpp b/clang/test/CIR/CodeGen/member-functions.cpp index 8be2c7fc2edbe..d46345dbadd6d 100644 --- a/clang/test/CIR/CodeGen/member-functions.cpp +++ b/clang/test/CIR/CodeGen/member-functions.cpp @@ -19,7 +19,7 @@ void C::f() {} void C::f2(int a, int b) {} -// CIR: cir.func{{.*}} @_ZN1C2f2Eii(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}, %[[A_ARG:.*]]: !s32i {{.*}}, %[[B_ARG:.*]]: !s32i {{.*}}) { +// CIR: cir.func{{.*}} @_ZN1C2f2Eii(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}, %[[A_ARG:.*]]: !s32i {{.*}}, %[[B_ARG:.*]]: !s32i {{.*}}) // CIR-NEXT: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] // CIR-NEXT: %[[A_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] // CIR-NEXT: %[[B_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] @@ -36,7 +36,7 @@ void test1() { c.f2(1, 2); } -// CIR: cir.func{{.*}} @_Z5test1v() { +// CIR: 
cir.func{{.*}} @_Z5test1v() // CIR-NEXT: %[[C_ADDR:.*]] = cir.alloca !rec_C, !cir.ptr, ["c"] // CIR-NEXT: cir.call @_ZN1C1fEv(%[[C_ADDR]]) : (!cir.ptr) -> () // CIR-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i diff --git a/clang/test/CIR/CodeGen/nrvo.cpp b/clang/test/CIR/CodeGen/nrvo.cpp index 72c39d7878dc6..ce08c795a77ee 100644 --- a/clang/test/CIR/CodeGen/nrvo.cpp +++ b/clang/test/CIR/CodeGen/nrvo.cpp @@ -22,13 +22,13 @@ struct S f1() { return s; } -// CIR: cir.func{{.*}} @_Z2f1v() -> !rec_S { +// CIR: cir.func{{.*}} @_Z2f1v() -> !rec_S // CIR-NEXT: %[[RETVAL:.*]] = cir.alloca !rec_S, !cir.ptr, ["__retval", init] // CIR-NEXT: cir.call @_ZN1SC1Ev(%[[RETVAL]]) : (!cir.ptr) -> () // CIR-NEXT: %[[RET:.*]] = cir.load %[[RETVAL]] : !cir.ptr, !rec_S // CIR-NEXT: cir.return %[[RET]] -// CIR-NOELIDE: cir.func{{.*}} @_Z2f1v() -> !rec_S { +// CIR-NOELIDE: cir.func{{.*}} @_Z2f1v() -> !rec_S // CIR-NOELIDE-NEXT: %[[RETVAL:.*]] = cir.alloca !rec_S, !cir.ptr, ["__retval"] // CIR-NOELIDE-NEXT: %[[S:.*]] = cir.alloca !rec_S, !cir.ptr, ["s", init] // CIR-NOELIDE-NEXT: cir.call @_ZN1SC1Ev(%[[S]]) : (!cir.ptr) -> () diff --git a/clang/test/CIR/CodeGen/opaque.cpp b/clang/test/CIR/CodeGen/opaque.cpp index 028bfd9ef4cd0..eac0dfa3755ab 100644 --- a/clang/test/CIR/CodeGen/opaque.cpp +++ b/clang/test/CIR/CodeGen/opaque.cpp @@ -154,3 +154,166 @@ void foo3() { // OGCG: [[COND_END]]: // OGCG: %[[RESULT:.*]] = phi i32 [ %[[TMP_A]], %[[COND_TRUE]] ], [ %[[TMP_B]], %[[COND_FALSE]] ] // OGCG: store i32 %[[RESULT]], ptr %[[C_ADDR]], align 4 + +void test_gnu_binary_lvalue_assign() { + int a = 5; + int b = 10; + (a ?: b) = 42; +} + +// CIR-LABEL: cir.func{{.*}} @_Z29test_gnu_binary_lvalue_assignv( +// CIR: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] +// CIR: %[[B:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] +// CIR: %[[A_VAL:.*]] = cir.load{{.*}} %[[A]] : !cir.ptr, !s32i +// CIR: %[[A_BOOL:.*]] = cir.cast int_to_bool %[[A_VAL]] : !s32i -> !cir.bool +// CIR: %[[TERNARY_PTR:.*]] = cir.ternary(%[[A_BOOL]], true { +// CIR: cir.yield %[[A]] : !cir.ptr +// CIR: }, false { +// CIR: cir.yield %[[B]] : !cir.ptr +// CIR: }) : (!cir.bool) -> !cir.ptr +// CIR: cir.store{{.*}} %{{.*}}, %[[TERNARY_PTR]] : !s32i, !cir.ptr + +// LLVM-LABEL: define{{.*}} void @_Z29test_gnu_binary_lvalue_assignv( +// LLVM: %[[A:.*]] = alloca i32 +// LLVM: %[[B:.*]] = alloca i32 +// LLVM: %[[A_VAL:.*]] = load i32, ptr %[[A]] +// LLVM: %[[COND:.*]] = icmp ne i32 %[[A_VAL]], 0 +// LLVM: br i1 %[[COND]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// LLVM: [[TRUE_BB]]: +// LLVM: br label %[[MERGE_BB:.*]] +// LLVM: [[FALSE_BB]]: +// LLVM: br label %[[MERGE_BB]] +// LLVM: [[MERGE_BB]]: +// LLVM: %[[PHI_PTR:.*]] = phi ptr [ %[[B]], %[[FALSE_BB]] ], [ %[[A]], %[[TRUE_BB]] ] +// LLVM: br label %[[CONT_BB:.*]] +// LLVM: [[CONT_BB]]: +// LLVM: store i32 42, ptr %[[PHI_PTR]] + +// OGCG-LABEL: define{{.*}} void @_Z29test_gnu_binary_lvalue_assignv( +// OGCG: %[[A:.*]] = alloca i32 +// OGCG: %[[B:.*]] = alloca i32 +// OGCG: %[[A_VAL:.*]] = load i32, ptr %[[A]] +// OGCG: %[[COND:.*]] = icmp ne i32 %[[A_VAL]], 0 +// OGCG: br i1 %[[COND]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// OGCG: [[TRUE_BB]]: +// OGCG: br label %[[MERGE_BB:.*]] +// OGCG: [[FALSE_BB]]: +// OGCG: br label %[[MERGE_BB]] +// OGCG: [[MERGE_BB]]: +// OGCG: %[[PHI_PTR:.*]] = phi ptr [ %[[A]], %[[TRUE_BB]] ], [ %[[B]], %[[FALSE_BB]] ] +// OGCG: store i32 42, ptr %[[PHI_PTR]] + +void test_gnu_binary_lvalue_compound() { + int a = 7; + int b = 14; + (a ?: b) += 5; +} + +// CIR-LABEL: 
cir.func{{.*}} @_Z31test_gnu_binary_lvalue_compoundv( +// CIR: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] +// CIR: %[[B:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] +// CIR: %[[A_VAL:.*]] = cir.load{{.*}} %[[A]] : !cir.ptr, !s32i +// CIR: %[[A_BOOL:.*]] = cir.cast int_to_bool %[[A_VAL]] : !s32i -> !cir.bool +// CIR: %[[LVAL_PTR:.*]] = cir.ternary(%[[A_BOOL]], true { +// CIR: cir.yield %[[A]] : !cir.ptr +// CIR: }, false { +// CIR: cir.yield %[[B]] : !cir.ptr +// CIR: }) : (!cir.bool) -> !cir.ptr +// CIR: %[[OLD_VAL:.*]] = cir.load{{.*}} %[[LVAL_PTR]] +// CIR: %[[NEW_VAL:.*]] = cir.binop(add, %[[OLD_VAL]], %{{.*}}) +// CIR: cir.store{{.*}} %[[NEW_VAL]], %[[LVAL_PTR]] + +// LLVM-LABEL: define{{.*}} void @_Z31test_gnu_binary_lvalue_compoundv( +// LLVM: %[[A:.*]] = alloca i32 +// LLVM: %[[B:.*]] = alloca i32 +// LLVM: %[[A_VAL:.*]] = load i32, ptr %[[A]] +// LLVM: %[[COND:.*]] = icmp ne i32 %[[A_VAL]], 0 +// LLVM: br i1 %[[COND]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// LLVM: [[TRUE_BB]]: +// LLVM: br label %[[MERGE_BB:.*]] +// LLVM: [[FALSE_BB]]: +// LLVM: br label %[[MERGE_BB]] +// LLVM: [[MERGE_BB]]: +// LLVM: %[[PTR:.*]] = phi ptr [ %[[B]], %[[FALSE_BB]] ], [ %[[A]], %[[TRUE_BB]] ] +// LLVM: br label %[[CONT:.*]] +// LLVM: [[CONT]]: +// LLVM: %[[OLD:.*]] = load i32, ptr %[[PTR]] +// LLVM: %[[NEW:.*]] = add{{.*}} i32 %[[OLD]], 5 +// LLVM: store i32 %[[NEW]], ptr %[[PTR]] + +// OGCG-LABEL: define{{.*}} void @_Z31test_gnu_binary_lvalue_compoundv( +// OGCG: %[[A:.*]] = alloca i32 +// OGCG: %[[B:.*]] = alloca i32 +// OGCG: %[[A_VAL:.*]] = load i32, ptr %[[A]] +// OGCG: %[[COND:.*]] = icmp ne i32 %[[A_VAL]], 0 +// OGCG: br i1 %[[COND]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// OGCG: [[TRUE_BB]]: +// OGCG: br label %[[MERGE_BB:.*]] +// OGCG: [[FALSE_BB]]: +// OGCG: br label %[[MERGE_BB]] +// OGCG: [[MERGE_BB]]: +// OGCG: %[[PTR:.*]] = phi ptr [ %[[A]], %[[TRUE_BB]] ], [ %[[B]], %[[FALSE_BB]] ] +// OGCG: %[[OLD:.*]] = load i32, ptr %[[PTR]] +// OGCG: %[[NEW:.*]] = add{{.*}} i32 %[[OLD]], 5 +// OGCG: store i32 %[[NEW]], ptr %[[PTR]] + +void test_gnu_binary_lvalue_ptr() { + int x = 1, y = 2; + int *p = &x; + int *q = nullptr; + *(p ?: q) = 99; +} + +// CIR-LABEL: cir.func{{.*}} @_Z26test_gnu_binary_lvalue_ptrv( +// CIR: %[[X:.*]] = cir.alloca !s32i, !cir.ptr, ["x", init] +// CIR: %[[Y:.*]] = cir.alloca !s32i, !cir.ptr, ["y", init] +// CIR: %[[P:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p", init] +// CIR: %[[Q:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["q", init] +// CIR: %[[P_VAL:.*]] = cir.load{{.*}} %[[P]] +// CIR: %[[P_BOOL:.*]] = cir.cast ptr_to_bool %[[P_VAL]] +// CIR: %[[PTR_RESULT:.*]] = cir.ternary(%[[P_BOOL]], true { +// CIR: %[[P_LOAD:.*]] = cir.load{{.*}} %[[P]] +// CIR: cir.yield %[[P_LOAD]] : !cir.ptr +// CIR: }, false { +// CIR: %[[Q_LOAD:.*]] = cir.load{{.*}} %[[Q]] +// CIR: cir.yield %[[Q_LOAD]] : !cir.ptr +// CIR: }) : (!cir.bool) -> !cir.ptr +// CIR: cir.store{{.*}} %{{.*}}, %[[PTR_RESULT]] + +// LLVM-LABEL: define{{.*}} void @_Z26test_gnu_binary_lvalue_ptrv( +// LLVM: %[[X:.*]] = alloca i32 +// LLVM: %[[Y:.*]] = alloca i32 +// LLVM: %[[P:.*]] = alloca ptr +// LLVM: %[[Q:.*]] = alloca ptr +// LLVM: %[[P_VAL:.*]] = load ptr, ptr %[[P]] +// LLVM: %[[COND:.*]] = icmp ne ptr %[[P_VAL]], null +// LLVM: br i1 %[[COND]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// LLVM: [[TRUE_BB]]: +// LLVM: %[[P_LOAD:.*]] = load ptr, ptr %[[P]] +// LLVM: br label %[[MERGE_BB:.*]] +// LLVM: [[FALSE_BB]]: +// LLVM: %[[Q_LOAD:.*]] = load ptr, ptr %[[Q]] +// LLVM: br label 
%[[MERGE_BB]] +// LLVM: [[MERGE_BB]]: +// LLVM: %[[PHI:.*]] = phi ptr [ %[[Q_LOAD]], %[[FALSE_BB]] ], [ %[[P_LOAD]], %[[TRUE_BB]] ] +// LLVM: br label %[[CONT:.*]] +// LLVM: [[CONT]]: +// LLVM: store i32 99, ptr %[[PHI]] + +// OGCG-LABEL: define{{.*}} void @_Z26test_gnu_binary_lvalue_ptrv( +// OGCG: %[[X:.*]] = alloca i32 +// OGCG: %[[Y:.*]] = alloca i32 +// OGCG: %[[P:.*]] = alloca ptr +// OGCG: %[[Q:.*]] = alloca ptr +// OGCG: %[[P_VAL:.*]] = load ptr, ptr %[[P]] +// OGCG: %[[COND:.*]] = icmp ne ptr %[[P_VAL]], null +// OGCG: br i1 %[[COND]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// OGCG: [[TRUE_BB]]: +// OGCG: %[[P_LOAD:.*]] = load ptr, ptr %[[P]] +// OGCG: br label %[[MERGE_BB:.*]] +// OGCG: [[FALSE_BB]]: +// OGCG: %[[Q_LOAD:.*]] = load ptr, ptr %[[Q]] +// OGCG: br label %[[MERGE_BB]] +// OGCG: [[MERGE_BB]]: +// OGCG: %[[PHI:.*]] = phi ptr [ %[[P_LOAD]], %[[TRUE_BB]] ], [ %[[Q_LOAD]], %[[FALSE_BB]] ] +// OGCG: store i32 99, ptr %[[PHI]] diff --git a/clang/test/CIR/CodeGen/ptrdiff.c b/clang/test/CIR/CodeGen/ptrdiff.c new file mode 100644 index 0000000000000..e6a754ebc8d4b --- /dev/null +++ b/clang/test/CIR/CodeGen/ptrdiff.c @@ -0,0 +1,51 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s + +int addrcmp(const void* a, const void* b) { + // CIR-LABEL: addrcmp + // CIR: %[[R:.*]] = cir.ptr_diff + // CIR: cir.cast integral %[[R]] : !s64i -> !s32i + + // LLVM-LABEL: define dso_local i32 @addrcmp( + // LLVM: %[[PTR_A:.*]] = ptrtoint ptr {{.*}} to i64 + // LLVM: %[[PTR_B:.*]] = ptrtoint ptr {{.*}} to i64 + // LLVM: %[[SUB:.*]] = sub i64 %[[PTR_A]], %[[PTR_B]] + // LLVM-NOT: sdiv + // LLVM: trunc i64 %[[SUB]] to i32 + + // OGCG-LABEL: define dso_local i32 @addrcmp( + // OGCG: %[[PTR_A:.*]] = ptrtoint ptr {{.*}} to i64 + // OGCG: %[[PTR_B:.*]] = ptrtoint ptr {{.*}} to i64 + // OGCG: %[[SUB:.*]] = sub i64 %[[PTR_A]], %[[PTR_B]] + // OGCG-NOT: sdiv + // OGCG: trunc i64 %[[SUB]] to i32 + return *(const void**)a - *(const void**)b; +} + +unsigned long long test_ptr_diff(int *a, int* b) { + // CIR-LABEL: test_ptr_diff + // CIR: %[[D:.*]] = cir.ptr_diff {{.*}} : !cir.ptr -> !s64i + // CIR: %[[U:.*]] = cir.cast integral %[[D]] : !s64i -> !u64i + // CIR: cir.return {{.*}} : !u64i + + // LLVM-LABEL: define dso_local i64 @test_ptr_diff( + // LLVM: %[[IA:.*]] = ptrtoint ptr %{{.*}} to i64 + // LLVM: %[[IB:.*]] = ptrtoint ptr %{{.*}} to i64 + // LLVM: %[[SUB:.*]] = sub i64 %[[IA]], %[[IB]] + // LLVM: %[[Q:.*]] = sdiv exact i64 %[[SUB]], 4 + // LLVM: store i64 %[[Q]], ptr %[[RETADDR:.*]], align + // LLVM: %[[RETLOAD:.*]] = load i64, ptr %[[RETADDR]], align + // LLVM: ret i64 %[[RETLOAD]] + + // OGCG-LABEL: define dso_local i64 @test_ptr_diff( + // OGCG: %[[IA:.*]] = ptrtoint ptr %{{.*}} to i64 + // OGCG: %[[IB:.*]] = ptrtoint ptr %{{.*}} to i64 + // OGCG: %[[SUB:.*]] = sub i64 %[[IA]], %[[IB]] + // OGCG: %[[Q:.*]] = sdiv exact i64 %[[SUB]], 4 + // OGCG: ret i64 %[[Q]] + return a - b; +} \ No newline at end of file diff --git a/clang/test/CIR/CodeGen/ptrdiff.cpp b/clang/test/CIR/CodeGen/ptrdiff.cpp new file mode 100644 index 0000000000000..34ba0ff725581 --- /dev/null +++ b/clang/test/CIR/CodeGen/ptrdiff.cpp @@ -0,0 
+1,33 @@ +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG + +typedef unsigned long size_type; + +size_type size(unsigned long *_start, unsigned long *_finish) { + // CIR-LABEL: cir.func dso_local @_Z4sizePmS_ + // CIR: %[[D:.*]] = cir.ptr_diff {{.*}} : !cir.ptr -> !s64i + // CIR: %[[U:.*]] = cir.cast integral %[[D]] : !s64i -> !u64i + // CIR: cir.return {{.*}} : !u64i + + // LLVM-LABEL: define dso_local {{.*}}i64 @_Z4sizePmS_( + // LLVM: %[[IA:.*]] = ptrtoint ptr %{{.*}} to i64 + // LLVM: %[[IB:.*]] = ptrtoint ptr %{{.*}} to i64 + // LLVM: %[[SUB:.*]] = sub i64 %[[IA]], %[[IB]] + // LLVM: %[[Q:.*]] = sdiv exact i64 %[[SUB]], 8 + // LLVM: store i64 %[[Q]], ptr %[[RETADDR:.*]], align + // LLVM: %[[RET:.*]] = load i64, ptr %[[RETADDR]], align + // LLVM: ret i64 %[[RET]] + + // OGCG-LABEL: define dso_local {{.*}}i64 @_Z4sizePmS_( + // OGCG: %[[IA:.*]] = ptrtoint ptr %{{.*}} to i64 + // OGCG: %[[IB:.*]] = ptrtoint ptr %{{.*}} to i64 + // OGCG: %[[SUB:.*]] = sub i64 %[[IA]], %[[IB]] + // OGCG: %[[Q:.*]] = sdiv exact i64 %[[SUB]], 8 + // OGCG: ret i64 %[[Q]] + + return static_cast<size_type>(_finish - _start); +} \ No newline at end of file diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp index 263799f8a5deb..6d362c79c1c44 100644 --- a/clang/test/CIR/CodeGen/struct.cpp +++ b/clang/test/CIR/CodeGen/struct.cpp @@ -265,7 +265,7 @@ void bin_comma() { // CIR: cir.func{{.*}} @_Z9bin_commav() // CIR: %[[A_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init] -// CIR: %[[TMP_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["agg.tmp0"] +// CIR: %[[TMP_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["agg.tmp.ensured"] // CIR: %[[ZERO:.*]] = cir.const #cir.zero : !rec_CompleteS // CIR: cir.store{{.*}} %[[ZERO]], %[[TMP_ADDR]] : !rec_CompleteS, !cir.ptr // CIR: %[[ZERO:.*]] = cir.const #cir.zero : !rec_CompleteS diff --git a/clang/test/CIR/CodeGen/ternary-throw.cpp b/clang/test/CIR/CodeGen/ternary-throw.cpp new file mode 100644 index 0000000000000..fb8897fa18a74 --- /dev/null +++ b/clang/test/CIR/CodeGen/ternary-throw.cpp @@ -0,0 +1,197 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -fexceptions -fcxx-exceptions -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -fexceptions -fcxx-exceptions -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fexceptions -fcxx-exceptions -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG + +const int& test_cond_throw_false(bool flag) { + const int a = 10; + return flag ? 
a : throw 0; +} + +// CIR-LABEL: cir.func{{.*}} @_Z21test_cond_throw_falseb( +// CIR: %[[FLAG:.*]] = cir.alloca !cir.bool, !cir.ptr, ["flag", init] +// CIR: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init, const] +// CIR: %[[TEN:.*]] = cir.const #cir.int<10> : !s32i +// CIR: cir.store{{.*}} %[[TEN]], %[[A]] : !s32i, !cir.ptr +// CIR: %[[FLAG_VAL:.*]] = cir.load{{.*}} %[[FLAG]] : !cir.ptr, !cir.bool +// CIR: %[[RESULT:.*]] = cir.ternary(%[[FLAG_VAL]], true { +// CIR: cir.yield %[[A]] : !cir.ptr +// CIR: }, false { +// CIR: %[[EXCEPTION:.*]] = cir.alloc.exception{{.*}} -> !cir.ptr +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store{{.*}} %[[ZERO]], %[[EXCEPTION]] : !s32i, !cir.ptr +// CIR: cir.throw %[[EXCEPTION]] : !cir.ptr, @_ZTIi +// CIR: cir.unreachable +// CIR: }) : (!cir.bool) -> !cir.ptr + +// LLVM-LABEL: define{{.*}} ptr @_Z21test_cond_throw_falseb( +// LLVM: %[[FLAG_ALLOCA:.*]] = alloca i8 +// LLVM: %[[RET_ALLOCA:.*]] = alloca ptr +// LLVM: %[[A_ALLOCA:.*]] = alloca i32 +// LLVM: %[[ZEXT:.*]] = zext i1 %{{.*}} to i8 +// LLVM: store i8 %[[ZEXT]], ptr %[[FLAG_ALLOCA]] +// LLVM: store i32 10, ptr %[[A_ALLOCA]] +// LLVM: %[[LOAD:.*]] = load i8, ptr %[[FLAG_ALLOCA]] +// LLVM: %[[BOOL:.*]] = trunc i8 %[[LOAD]] to i1 +// LLVM: br i1 %[[BOOL]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// LLVM: [[TRUE_BB]]: +// LLVM: br label %[[PHI_BB:.*]] +// LLVM: [[FALSE_BB]]: +// LLVM: %[[EXC:.*]] = call{{.*}} ptr @__cxa_allocate_exception +// LLVM: store i32 0, ptr %[[EXC]] +// LLVM: call void @__cxa_throw(ptr %[[EXC]], ptr @_ZTIi +// LLVM: unreachable +// LLVM: [[PHI_BB]]: +// LLVM: %[[PHI:.*]] = phi ptr [ %[[A_ALLOCA]], %[[TRUE_BB]] ] +// LLVM: br label %[[CONT_BB:.*]] +// LLVM: [[CONT_BB]]: +// LLVM: store ptr %[[A_ALLOCA]], ptr %[[RET_ALLOCA]] +// LLVM: %[[RET:.*]] = load ptr, ptr %[[RET_ALLOCA]] +// LLVM: ret ptr %[[RET]] + +// OGCG-LABEL: define{{.*}} ptr @_Z21test_cond_throw_falseb( +// OGCG: %{{.*}} = alloca i8 +// OGCG: %[[A:.*]] = alloca i32 +// OGCG: store i32 10, ptr %[[A]] +// OGCG: %{{.*}} = load i8, ptr %{{.*}} +// OGCG: %[[BOOL:.*]] = trunc i8 %{{.*}} to i1 +// OGCG: br i1 %[[BOOL]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// OGCG: [[TRUE_BB]]: +// OGCG: br label %[[END:.*]] +// OGCG: [[FALSE_BB]]: +// OGCG: %{{.*}} = call{{.*}} ptr @__cxa_allocate_exception +// OGCG: store i32 0, ptr %{{.*}} +// OGCG: call void @__cxa_throw(ptr %{{.*}}, ptr @_ZTIi +// OGCG: unreachable +// OGCG: [[END]]: +// OGCG: ret ptr %[[A]] + +const int& test_cond_throw_true(bool flag) { + const int a = 10; + return flag ? 
throw 0 : a; +} + +// CIR-LABEL: cir.func{{.*}} @_Z20test_cond_throw_trueb( +// CIR: %[[FLAG:.*]] = cir.alloca !cir.bool, !cir.ptr, ["flag", init] +// CIR: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init, const] +// CIR: %[[TEN:.*]] = cir.const #cir.int<10> : !s32i +// CIR: cir.store{{.*}} %[[TEN]], %[[A]] : !s32i, !cir.ptr +// CIR: %[[FLAG_VAL:.*]] = cir.load{{.*}} %[[FLAG]] : !cir.ptr, !cir.bool +// CIR: %[[RESULT:.*]] = cir.ternary(%[[FLAG_VAL]], true { +// CIR: %[[EXCEPTION:.*]] = cir.alloc.exception{{.*}} -> !cir.ptr +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store{{.*}} %[[ZERO]], %[[EXCEPTION]] : !s32i, !cir.ptr +// CIR: cir.throw %[[EXCEPTION]] : !cir.ptr, @_ZTIi +// CIR: cir.unreachable +// CIR: }, false { +// CIR: cir.yield %[[A]] : !cir.ptr +// CIR: }) : (!cir.bool) -> !cir.ptr + +// LLVM-LABEL: define{{.*}} ptr @_Z20test_cond_throw_trueb( +// LLVM: %[[FLAG_ALLOCA:.*]] = alloca i8 +// LLVM: %[[RET_ALLOCA:.*]] = alloca ptr +// LLVM: %[[A_ALLOCA:.*]] = alloca i32 +// LLVM: %[[ZEXT:.*]] = zext i1 %{{.*}} to i8 +// LLVM: store i8 %[[ZEXT]], ptr %[[FLAG_ALLOCA]] +// LLVM: store i32 10, ptr %[[A_ALLOCA]] +// LLVM: %[[LOAD:.*]] = load i8, ptr %[[FLAG_ALLOCA]] +// LLVM: %[[BOOL:.*]] = trunc i8 %[[LOAD]] to i1 +// LLVM: br i1 %[[BOOL]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// LLVM: [[TRUE_BB]]: +// LLVM: %[[EXC:.*]] = call{{.*}} ptr @__cxa_allocate_exception +// LLVM: store i32 0, ptr %[[EXC]] +// LLVM: call void @__cxa_throw(ptr %[[EXC]], ptr @_ZTIi +// LLVM: unreachable +// LLVM: [[FALSE_BB]]: +// LLVM: br label %[[PHI_BB:.*]] +// LLVM: [[PHI_BB]]: +// LLVM: %[[PHI:.*]] = phi ptr [ %[[A_ALLOCA]], %[[FALSE_BB]] ] +// LLVM: br label %[[CONT_BB:.*]] +// LLVM: [[CONT_BB]]: +// LLVM: store ptr %[[A_ALLOCA]], ptr %[[RET_ALLOCA]] +// LLVM: %[[RET:.*]] = load ptr, ptr %[[RET_ALLOCA]] +// LLVM: ret ptr %[[RET]] + +// OGCG-LABEL: define{{.*}} ptr @_Z20test_cond_throw_trueb( +// OGCG: %{{.*}} = alloca i8 +// OGCG: %[[A:.*]] = alloca i32 +// OGCG: store i32 10, ptr %[[A]] +// OGCG: %{{.*}} = load i8, ptr %{{.*}} +// OGCG: %[[BOOL:.*]] = trunc i8 %{{.*}} to i1 +// OGCG: br i1 %[[BOOL]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// OGCG: [[TRUE_BB]]: +// OGCG: %{{.*}} = call{{.*}} ptr @__cxa_allocate_exception +// OGCG: store i32 0, ptr %{{.*}} +// OGCG: call void @__cxa_throw(ptr %{{.*}}, ptr @_ZTIi +// OGCG: unreachable +// OGCG: [[FALSE_BB]]: +// OGCG: br label %[[END:.*]] +// OGCG: [[END]]: +// OGCG: ret ptr %[[A]] + +// Test constant folding with throw - compile-time true condition, dead throw in false branch +const int& test_cond_const_true_throw_false() { + const int a = 20; + return true ? 
a : throw 0; +} + +// CIR-LABEL: cir.func{{.*}} @_Z32test_cond_const_true_throw_falsev( +// CIR: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init, const] +// CIR: %[[TWENTY:.*]] = cir.const #cir.int<20> : !s32i +// CIR: cir.store{{.*}} %[[TWENTY]], %[[A]] : !s32i, !cir.ptr +// CIR-NOT: cir.ternary +// CIR-NOT: cir.throw +// CIR: cir.store %[[A]] +// CIR: %[[RET:.*]] = cir.load +// CIR: cir.return %[[RET]] : !cir.ptr + +// LLVM-LABEL: define{{.*}} ptr @_Z32test_cond_const_true_throw_falsev( +// LLVM: %[[A:.*]] = alloca i32 +// LLVM: store i32 20, ptr %[[A]] +// LLVM-NOT: br i1 +// LLVM-NOT: __cxa_throw +// LLVM: store ptr %[[A]] +// LLVM: %[[RET:.*]] = load ptr +// LLVM: ret ptr %[[RET]] + +// OGCG-LABEL: define{{.*}} ptr @_Z32test_cond_const_true_throw_falsev( +// OGCG: %[[A:.*]] = alloca i32 +// OGCG: store i32 20, ptr %[[A]] +// OGCG-NOT: br i1 +// OGCG-NOT: __cxa_throw +// OGCG: ret ptr %[[A]] + +// Test constant folding with throw - compile-time false condition, dead throw in true branch +const int& test_cond_const_false_throw_true() { + const int a = 30; + return false ? throw 0 : a; +} + +// CIR-LABEL: cir.func{{.*}} @_Z32test_cond_const_false_throw_truev( +// CIR: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init, const] +// CIR: %[[THIRTY:.*]] = cir.const #cir.int<30> : !s32i +// CIR: cir.store{{.*}} %[[THIRTY]], %[[A]] : !s32i, !cir.ptr +// CIR-NOT: cir.ternary +// CIR-NOT: cir.throw +// CIR: cir.store %[[A]] +// CIR: %[[RET:.*]] = cir.load +// CIR: cir.return %[[RET]] : !cir.ptr + +// LLVM-LABEL: define{{.*}} ptr @_Z32test_cond_const_false_throw_truev( +// LLVM: %[[A:.*]] = alloca i32 +// LLVM: store i32 30, ptr %[[A]] +// LLVM-NOT: br i1 +// LLVM-NOT: __cxa_throw +// LLVM: store ptr %[[A]] +// LLVM: %[[RET:.*]] = load ptr +// LLVM: ret ptr %[[RET]] + +// OGCG-LABEL: define{{.*}} ptr @_Z32test_cond_const_false_throw_truev( +// OGCG: %[[A:.*]] = alloca i32 +// OGCG: store i32 30, ptr %[[A]] +// OGCG-NOT: br i1 +// OGCG-NOT: __cxa_throw +// OGCG: ret ptr %[[A]] + diff --git a/clang/test/CIR/CodeGen/ternary.cpp b/clang/test/CIR/CodeGen/ternary.cpp index eb38ee3083e5c..847c0b4a04009 100644 --- a/clang/test/CIR/CodeGen/ternary.cpp +++ b/clang/test/CIR/CodeGen/ternary.cpp @@ -10,11 +10,11 @@ int x(int y) { } // CIR-LABEL: cir.func{{.*}} @_Z1xi( -// CIR-SAME: %[[ARG0:.*]]: !s32i {{.*}}) -> !s32i { -// CIR: [[Y:%.+]] = cir.alloca !s32i, !cir.ptr, ["y", init] {alignment = 4 : i64} -// CIR: [[RETVAL:%.+]] = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} +// CIR-SAME: %[[ARG0:.*]]: !s32i {{.*}}) -> !s32i{{.*}}{ +// CIR: [[Y:%.+]] = cir.alloca !s32i, !cir.ptr, ["y", init] +// CIR: [[RETVAL:%.+]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: cir.store %[[ARG0]], [[Y]] : !s32i, !cir.ptr -// CIR: [[YVAL:%.+]] = cir.load align(4) [[Y]] : !cir.ptr, !s32i +// CIR: [[YVAL:%.+]] = cir.load{{.*}} [[Y]] : !cir.ptr, !s32i // CIR: [[ZERO:%.+]] = cir.const #cir.int<0> : !s32i // CIR: [[CMP:%.+]] = cir.cmp(gt, [[YVAL]], [[ZERO]]) : !s32i, !cir.bool // CIR: [[THREE:%.+]] = cir.const #cir.int<3> : !s32i @@ -52,21 +52,21 @@ int foo(int a, int b) { } // CIR-LABEL: cir.func{{.*}} @_Z3fooii( -// CIR-SAME: %[[ARG0:.*]]: !s32i {{.*}}, %[[ARG1:.*]]: !s32i {{.*}}) -> !s32i { -// CIR: [[A:%.+]] = cir.alloca !s32i, !cir.ptr, ["a", init] {alignment = 4 : i64} -// CIR: [[B:%.+]] = cir.alloca !s32i, !cir.ptr, ["b", init] {alignment = 4 : i64} -// CIR: [[RETVAL:%.+]] = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} +// CIR-SAME: %[[ARG0:.*]]: !s32i {{.*}}, %[[ARG1:.*]]: !s32i 
{{.*}}) -> !s32i{{.*}}{ +// CIR: [[A:%.+]] = cir.alloca !s32i, !cir.ptr, ["a", init] +// CIR: [[B:%.+]] = cir.alloca !s32i, !cir.ptr, ["b", init] +// CIR: [[RETVAL:%.+]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: cir.store %[[ARG0]], [[A]] : !s32i, !cir.ptr // CIR: cir.store %[[ARG1]], [[B]] : !s32i, !cir.ptr // CIR: cir.scope { -// CIR: [[ALOAD:%.+]] = cir.load align(4) [[A]] : !cir.ptr, !s32i -// CIR: [[BLOAD:%.+]] = cir.load align(4) [[B]] : !cir.ptr, !s32i +// CIR: [[ALOAD:%.+]] = cir.load{{.*}} [[A]] : !cir.ptr, !s32i +// CIR: [[BLOAD:%.+]] = cir.load{{.*}} [[B]] : !cir.ptr, !s32i // CIR: [[CMP:%.+]] = cir.cmp(lt, [[ALOAD]], [[BLOAD]]) : !s32i, !cir.bool // CIR: [[TERNARY_RES:%.+]] = cir.ternary([[CMP]], true { // CIR: [[ZERO:%.+]] = cir.const #cir.int<0> : !s32i // CIR: cir.yield [[ZERO]] : !s32i // CIR: }, false { -// CIR: [[ALOAD2:%.+]] = cir.load align(4) [[A]] : !cir.ptr, !s32i +// CIR: [[ALOAD2:%.+]] = cir.load{{.*}} [[A]] : !cir.ptr, !s32i // CIR: cir.yield [[ALOAD2]] : !s32i // CIR: }) : (!cir.bool) -> !s32i // CIR: [[CAST:%.+]] = cir.cast int_to_bool [[TERNARY_RES]] : !s32i -> !cir.bool @@ -145,3 +145,214 @@ int foo(int a, int b) { // OGCG: [[RETURN]]: // OGCG: %[[RET2:.*]] = load i32, ptr %[[RETVAL]] // OGCG: ret i32 %[[RET2]] + +void test_cond_lvalue_assign(bool flag) { + int a = 1; + int b = 2; + (flag ? a : b) = 42; +} + +// CIR-LABEL: cir.func{{.*}} @_Z23test_cond_lvalue_assignb( +// CIR: %[[FLAG:.*]] = cir.alloca !cir.bool, !cir.ptr, ["flag", init] +// CIR: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] +// CIR: %[[B:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] +// CIR: %[[FLAG_VAL:.*]] = cir.load{{.*}} %[[FLAG]] +// CIR: %[[TERNARY_PTR:.*]] = cir.ternary(%[[FLAG_VAL]], true { +// CIR: cir.yield %[[A]] : !cir.ptr +// CIR: }, false { +// CIR: cir.yield %[[B]] : !cir.ptr +// CIR: }) : (!cir.bool) -> !cir.ptr +// CIR: cir.store{{.*}} %{{.*}}, %[[TERNARY_PTR]] : !s32i, !cir.ptr + +// LLVM-LABEL: define{{.*}} void @_Z23test_cond_lvalue_assignb( +// LLVM: %[[FLAG:.*]] = alloca i8 +// LLVM: %[[A:.*]] = alloca i32 +// LLVM: %[[B:.*]] = alloca i32 +// LLVM: %[[FLAG_VAL:.*]] = load i8, ptr %[[FLAG]] +// LLVM: %[[COND:.*]] = trunc i8 %[[FLAG_VAL]] to i1 +// LLVM: br i1 %[[COND]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// LLVM: [[TRUE_BB]]: +// LLVM: br label %[[MERGE_BB:.*]] +// LLVM: [[FALSE_BB]]: +// LLVM: br label %[[MERGE_BB]] +// LLVM: [[MERGE_BB]]: +// LLVM: %[[PHI_PTR:.*]] = phi ptr [ %[[B]], %[[FALSE_BB]] ], [ %[[A]], %[[TRUE_BB]] ] +// LLVM: br label %[[CONT_BB:.*]] +// LLVM: [[CONT_BB]]: +// LLVM: store i32 42, ptr %[[PHI_PTR]] + +// OGCG-LABEL: define{{.*}} void @_Z23test_cond_lvalue_assignb( +// OGCG: %[[FLAG:.*]] = alloca i8 +// OGCG: %[[A:.*]] = alloca i32 +// OGCG: %[[B:.*]] = alloca i32 +// OGCG: %[[FLAG_VAL:.*]] = load i8, ptr %[[FLAG]] +// OGCG: %[[COND:.*]] = trunc i8 %[[FLAG_VAL]] to i1 +// OGCG: br i1 %[[COND]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// OGCG: [[TRUE_BB]]: +// OGCG: br label %[[MERGE_BB:.*]] +// OGCG: [[FALSE_BB]]: +// OGCG: br label %[[MERGE_BB]] +// OGCG: [[MERGE_BB]]: +// OGCG: %[[PHI_PTR:.*]] = phi ptr [ %[[A]], %[[TRUE_BB]] ], [ %[[B]], %[[FALSE_BB]] ] +// OGCG: store i32 42, ptr %[[PHI_PTR]] + +int& test_cond_lvalue_ref(bool cond, int x, int y) { + return cond ? 
x : y; +} + +// CIR-LABEL: cir.func{{.*}} @_Z20test_cond_lvalue_refbii( +// CIR: %[[COND:.*]] = cir.alloca !cir.bool, !cir.ptr, ["cond", init] +// CIR: %[[X:.*]] = cir.alloca !s32i, !cir.ptr, ["x", init] +// CIR: %[[Y:.*]] = cir.alloca !s32i, !cir.ptr, ["y", init] +// CIR: %[[COND_VAL:.*]] = cir.load{{.*}} %[[COND]] +// CIR: %[[REF_PTR:.*]] = cir.ternary(%[[COND_VAL]], true { +// CIR: cir.yield %[[X]] : !cir.ptr +// CIR: }, false { +// CIR: cir.yield %[[Y]] : !cir.ptr +// CIR: }) : (!cir.bool) -> !cir.ptr +// CIR: cir.store %[[REF_PTR]] +// CIR: %[[RET_VAL:.*]] = cir.load +// CIR: cir.return %[[RET_VAL]] : !cir.ptr + +// LLVM-LABEL: define{{.*}} ptr @_Z20test_cond_lvalue_refbii( +// LLVM: %[[COND:.*]] = alloca i8 +// LLVM: %[[X:.*]] = alloca i32 +// LLVM: %[[Y:.*]] = alloca i32 +// LLVM: %[[COND_VAL:.*]] = load i8, ptr %[[COND]] +// LLVM: %[[BOOL:.*]] = trunc i8 %[[COND_VAL]] to i1 +// LLVM: br i1 %[[BOOL]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// LLVM: [[TRUE_BB]]: +// LLVM: br label %[[MERGE_BB:.*]] +// LLVM: [[FALSE_BB]]: +// LLVM: br label %[[MERGE_BB]] +// LLVM: [[MERGE_BB]]: +// LLVM: %[[PHI:.*]] = phi ptr [ %[[Y]], %[[FALSE_BB]] ], [ %[[X]], %[[TRUE_BB]] ] +// LLVM: br label %[[CONT:.*]] +// LLVM: [[CONT]]: +// LLVM: store ptr %[[PHI]] +// LLVM: %[[RET:.*]] = load ptr +// LLVM: ret ptr %[[RET]] + +// OGCG-LABEL: define{{.*}} ptr @_Z20test_cond_lvalue_refbii( +// OGCG: %[[COND:.*]] = alloca i8 +// OGCG: %[[X:.*]] = alloca i32 +// OGCG: %[[Y:.*]] = alloca i32 +// OGCG: %[[COND_VAL:.*]] = load i8, ptr %[[COND]] +// OGCG: %[[BOOL:.*]] = trunc i8 %[[COND_VAL]] to i1 +// OGCG: br i1 %[[BOOL]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// OGCG: [[TRUE_BB]]: +// OGCG: br label %[[MERGE_BB:.*]] +// OGCG: [[FALSE_BB]]: +// OGCG: br label %[[MERGE_BB]] +// OGCG: [[MERGE_BB]]: +// OGCG: %[[PHI:.*]] = phi ptr [ %[[X]], %[[TRUE_BB]] ], [ %[[Y]], %[[FALSE_BB]] ] +// OGCG: ret ptr %[[PHI]] + +// Test ConditionalOperator as lvalue - compound assignment +void test_cond_lvalue_compound(bool flag) { + int a = 5; + int b = 10; + (flag ? 
a : b) += 3; +} + +// CIR-LABEL: cir.func{{.*}} @_Z25test_cond_lvalue_compoundb( +// CIR: %[[FLAG:.*]] = cir.alloca !cir.bool, !cir.ptr, ["flag", init] +// CIR: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] +// CIR: %[[B:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] +// CIR: %[[FLAG_VAL:.*]] = cir.load{{.*}} %[[FLAG]] +// CIR: %[[LVAL_PTR:.*]] = cir.ternary(%[[FLAG_VAL]], true { +// CIR: cir.yield %[[A]] : !cir.ptr +// CIR: }, false { +// CIR: cir.yield %[[B]] : !cir.ptr +// CIR: }) : (!cir.bool) -> !cir.ptr +// CIR: %[[OLD_VAL:.*]] = cir.load{{.*}} %[[LVAL_PTR]] +// CIR: %[[NEW_VAL:.*]] = cir.binop(add, %[[OLD_VAL]], %{{.*}}) +// CIR: cir.store{{.*}} %[[NEW_VAL]], %[[LVAL_PTR]] + +// LLVM-LABEL: define{{.*}} void @_Z25test_cond_lvalue_compoundb( +// LLVM: %[[FLAG:.*]] = alloca i8 +// LLVM: %[[A:.*]] = alloca i32 +// LLVM: %[[B:.*]] = alloca i32 +// LLVM: %[[FLAG_VAL:.*]] = load i8, ptr %[[FLAG]] +// LLVM: %[[BOOL:.*]] = trunc i8 %[[FLAG_VAL]] to i1 +// LLVM: br i1 %[[BOOL]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// LLVM: [[TRUE_BB]]: +// LLVM: br label %[[MERGE_BB:.*]] +// LLVM: [[FALSE_BB]]: +// LLVM: br label %[[MERGE_BB]] +// LLVM: [[MERGE_BB]]: +// LLVM: %[[PTR:.*]] = phi ptr [ %[[B]], %[[FALSE_BB]] ], [ %[[A]], %[[TRUE_BB]] ] +// LLVM: br label %[[CONT:.*]] +// LLVM: [[CONT]]: +// LLVM: %[[OLD:.*]] = load i32, ptr %[[PTR]] +// LLVM: %[[NEW:.*]] = add{{.*}} i32 %[[OLD]], 3 +// LLVM: store i32 %[[NEW]], ptr %[[PTR]] + +// OGCG-LABEL: define{{.*}} void @_Z25test_cond_lvalue_compoundb( +// OGCG: %[[FLAG:.*]] = alloca i8 +// OGCG: %[[A:.*]] = alloca i32 +// OGCG: %[[B:.*]] = alloca i32 +// OGCG: %[[FLAG_VAL:.*]] = load i8, ptr %[[FLAG]] +// OGCG: %[[BOOL:.*]] = trunc i8 %[[FLAG_VAL]] to i1 +// OGCG: br i1 %[[BOOL]], label %[[TRUE_BB:.*]], label %[[FALSE_BB:.*]] +// OGCG: [[TRUE_BB]]: +// OGCG: br label %[[MERGE_BB:.*]] +// OGCG: [[FALSE_BB]]: +// OGCG: br label %[[MERGE_BB]] +// OGCG: [[MERGE_BB]]: +// OGCG: %[[PTR:.*]] = phi ptr [ %[[A]], %[[TRUE_BB]] ], [ %[[B]], %[[FALSE_BB]] ] +// OGCG: %[[OLD:.*]] = load i32, ptr %[[PTR]] +// OGCG: %[[NEW:.*]] = add{{.*}} i32 %[[OLD]], 3 +// OGCG: store i32 %[[NEW]], ptr %[[PTR]] + +// Test constant folding - compile-time true condition with lvalue assignment +void test_cond_const_true_lvalue() { + int a = 1; + int b = 2; + (true ? a : b) = 99; +} + +// CIR-LABEL: cir.func{{.*}} @_Z27test_cond_const_true_lvaluev( +// CIR: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] +// CIR: %[[B:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] +// CIR-NOT: cir.ternary +// CIR: %[[NINETYNINE:.*]] = cir.const #cir.int<99> : !s32i +// CIR: cir.store{{.*}} %[[NINETYNINE]], %[[A]] : !s32i, !cir.ptr + +// LLVM-LABEL: define{{.*}} void @_Z27test_cond_const_true_lvaluev( +// LLVM: %[[A:.*]] = alloca i32 +// LLVM: %[[B:.*]] = alloca i32 +// LLVM-NOT: br i1 +// LLVM: store i32 99, ptr %[[A]] + +// OGCG-LABEL: define{{.*}} void @_Z27test_cond_const_true_lvaluev( +// OGCG: %[[A:.*]] = alloca i32 +// OGCG: %[[B:.*]] = alloca i32 +// OGCG-NOT: br i1 +// OGCG: store i32 99, ptr %[[A]] + +// Test constant folding - compile-time false condition with lvalue assignment +void test_cond_const_false_lvalue() { + int a = 1; + int b = 2; + (false ? 
a : b) = 88; +} + +// CIR-LABEL: cir.func{{.*}} @_Z28test_cond_const_false_lvaluev( +// CIR: %[[A:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] +// CIR: %[[B:.*]] = cir.alloca !s32i, !cir.ptr, ["b", init] +// CIR-NOT: cir.ternary +// CIR: %[[EIGHTYEIGHT:.*]] = cir.const #cir.int<88> : !s32i +// CIR: cir.store{{.*}} %[[EIGHTYEIGHT]], %[[B]] : !s32i, !cir.ptr + +// LLVM-LABEL: define{{.*}} void @_Z28test_cond_const_false_lvaluev( +// LLVM: %[[A:.*]] = alloca i32 +// LLVM: %[[B:.*]] = alloca i32 +// LLVM-NOT: br i1 +// LLVM: store i32 88, ptr %[[B]] + +// OGCG-LABEL: define{{.*}} void @_Z28test_cond_const_false_lvaluev( +// OGCG: %[[A:.*]] = alloca i32 +// OGCG: %[[B:.*]] = alloca i32 +// OGCG-NOT: br i1 +// OGCG: store i32 88, ptr %[[B]] diff --git a/clang/test/CIR/CodeGen/throws.cpp b/clang/test/CIR/CodeGen/throws.cpp index ff6aa62157faa..53af1efc22cd4 100644 --- a/clang/test/CIR/CodeGen/throws.cpp +++ b/clang/test/CIR/CodeGen/throws.cpp @@ -123,3 +123,124 @@ void paren_expr() { (throw 0, 1 + 2); } // OGCG: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 4) // OGCG: store i32 0, ptr %[[EXCEPTION_ADDR]], align 16 // OGCG: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIi, ptr null) + +void throw_complex_expr() { + throw __builtin_complex(1.1f, 2.2f); +} + +// CIR: %[[EXCEPTION_ADDR:.*]] = cir.alloc.exception 8 -> !cir.ptr> +// CIR: %[[EXCEPTION_VALUE:.*]] = cir.const #cir.const_complex<#cir.fp<1.100000e+00> : !cir.float, #cir.fp<2.200000e+00> : !cir.float> : !cir.complex +// CIR: cir.store{{.*}} %[[EXCEPTION_VALUE]], %[[EXCEPTION_ADDR]] : !cir.complex, !cir.ptr> +// CIR: cir.throw %[[EXCEPTION_ADDR]] : !cir.ptr>, @_ZTICf +// CIR: cir.unreachable + +// LLVM: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 8) +// LLVM: store { float, float } { float 0x3FF19999A0000000, float 0x40019999A0000000 }, ptr %[[EXCEPTION_ADDR]], align 16 +// LLVM: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTICf, ptr null) + +// OGCG: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 8) +// OGCG: %[[EXCEPTION_REAL:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[EXCEPTION_ADDR]], i32 0, i32 0 +// OGCG: %[[EXCEPTION_IMAG:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[EXCEPTION_ADDR]], i32 0, i32 1 +// OGCG: store float 0x3FF19999A0000000, ptr %[[EXCEPTION_REAL]], align 16 +// OGCG: store float 0x40019999A0000000, ptr %[[EXCEPTION_IMAG]], align 4 +// OGCG: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTICf, ptr null) + +void throw_vector_type() { + typedef int vi4 __attribute__((vector_size(16))); + vi4 a; + throw a; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a"] +// CIR: %[[EXCEPTION_ADDR:.*]] = cir.alloc.exception 16 -> !cir.ptr> +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[TMP_A]], %[[EXCEPTION_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CIR: cir.throw %[[EXCEPTION_ADDR]] : !cir.ptr>, @_ZTIDv4_i +// CIR: cir.unreachable + +// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 16) +// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16 +// LLVM: store <4 x i32> %[[TMP_A]], ptr %[[EXCEPTION_ADDR]], align 16 +// LLVM: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIDv4_i, ptr null) + +// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 16) +// 
OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16 +// OGCG: store <4 x i32> %[[TMP_A]], ptr %[[EXCEPTION_ADDR]], align 16 +// OGCG: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIDv4_i, ptr null) +// OGCG: unreachable + +void throw_ext_vector_type() { + typedef int vi4 __attribute__((ext_vector_type(4))); + vi4 a; + throw a; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a"] +// CIR: %[[EXCEPTION_ADDR:.*]] = cir.alloc.exception 16 -> !cir.ptr> +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[TMP_A]], %[[EXCEPTION_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CIR: cir.throw %[[EXCEPTION_ADDR]] : !cir.ptr>, @_ZTIDv4_i +// CIR: cir.unreachable + +// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 16) +// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16 +// LLVM: store <4 x i32> %[[TMP_A]], ptr %[[EXCEPTION_ADDR]], align 16 +// LLVM: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIDv4_i, ptr null) + +// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 16) +// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16 +// OGCG: store <4 x i32> %[[TMP_A]], ptr %[[EXCEPTION_ADDR]], align 16 +// OGCG: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIDv4_i, ptr null) +// OGCG: unreachable + +void throw_enum_expr() { + enum Test { + TestA, + TestB + }; + throw Test::TestA; +} + +// CIR: %[[EXCEPTION_ADDR:.*]] = cir.alloc.exception 4 -> !cir.ptr +// CIR: %[[EXCEPTION_VALUE:.*]] = cir.const #cir.int<0> : !u32i +// CIR: cir.store{{.*}} %[[EXCEPTION_VALUE]], %[[EXCEPTION_ADDR]] : !u32i, !cir.ptr +// CIR: cir.throw %[[EXCEPTION_ADDR]] : !cir.ptr, @_ZTIZ15throw_enum_exprvE4Test +// CIR: cir.unreachable + +// LLVM: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 4) +// LLVM: store i32 0, ptr %[[EXCEPTION_ADDR]], align 16 +// LLVM: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIZ15throw_enum_exprvE4Test, ptr null) +// LLVM: unreachable + +// OGCG: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 4) +// OGCG: store i32 0, ptr %[[EXCEPTION_ADDR]], align 16 +// OGCG: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIZ15throw_enum_exprvE4Test, ptr null) +// OGCG: unreachable + +void throw_enum_class_expr() { + enum class Test { + TestA, + TestB + }; + throw Test::TestA; +} + +// CIR: %[[EXCEPTION_ADDR:.*]] = cir.alloc.exception 4 -> !cir.ptr +// CIR: %[[EXCEPTION_VALUE:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store{{.*}} %[[EXCEPTION_VALUE]], %[[EXCEPTION_ADDR]] : !s32i, !cir.ptr +// CIR: cir.throw %[[EXCEPTION_ADDR]] : !cir.ptr, @_ZTIZ21throw_enum_class_exprvE4Test +// CIR: cir.unreachable + +// LLVM: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 4) +// LLVM: store i32 0, ptr %[[EXCEPTION_ADDR]], align 16 +// LLVM: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIZ21throw_enum_class_exprvE4Test, ptr null) +// LLVM: unreachable + +// OGCG: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 4) +// OGCG: store i32 0, ptr %[[EXCEPTION_ADDR]], align 16 +// OGCG: call void @__cxa_throw(ptr %[[EXCEPTION_ADDR]], ptr @_ZTIZ21throw_enum_class_exprvE4Test, ptr null) +// OGCG: unreachable diff --git a/clang/test/CIR/CodeGen/try-catch.cpp b/clang/test/CIR/CodeGen/try-catch.cpp index 8f0b3c4f879dd..5a503102d13df 
100644 --- a/clang/test/CIR/CodeGen/try-catch.cpp +++ b/clang/test/CIR/CodeGen/try-catch.cpp @@ -30,3 +30,90 @@ void empty_try_block_with_catch_with_int_exception() { // OGCG: define{{.*}} void @_Z45empty_try_block_with_catch_with_int_exceptionv() // OGCG: ret void + +void try_catch_with_empty_catch_all() { + int a = 1; + try { + return; + ++a; + } catch (...) { + } +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store{{.*}} %[[CONST_1]], %[[A_ADDR]] : !s32i, !cir.ptr +// CIR: cir.scope { +// CIR: cir.try { +// CIR: cir.return +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr, !s32i +// CIR: %[[RESULT:.*]] = cir.unary(inc, %[[TMP_A]]) nsw : !s32i, !s32i +// CIR: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !s32i, !cir.ptr +// CIR: cir.yield +// CIR: } +// CIR: } + +// LLVM: %[[A_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: store i32 1, ptr %[[A_ADDR]], align 4 +// LLVM: br label %[[BB_2:.*]] +// LLVM: [[BB_2]]: +// LLVM: br label %[[BB_3:.*]] +// LLVM: [[BB_3]]: +// LLVM: ret void +// LLVM: [[BB_4:.*]]: +// LLVM: %[[TMP_A:.*]] = load i32, ptr %[[A_ADDR]], align 4 +// LLVM: %[[RESULT:.*]] = add nsw i32 %[[TMP_A]], 1 +// LLVM: store i32 %[[RESULT]], ptr %[[A_ADDR]], align 4 +// LLVM: br label %[[BB_7:.*]] +// LLVM: [[BB_7]]: +// LLVM: br label %[[BB_8:.*]] +// LLVM: [[BB_8]]: +// LLVM: ret void + +// OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 +// OGCG: store i32 1, ptr %[[A_ADDR]], align 4 +// OGCG: ret void + +void try_catch_with_empty_catch_all_2() { + int a = 1; + try { + ++a; + return; + } catch (...) { + } +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store{{.*}} %[[CONST_1]], %[[A_ADDR]] : !s32i, !cir.ptr +// CIR: cir.scope { +// CIR: cir.try { +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr, !s32i +// CIR: %[[RESULT:.*]] = cir.unary(inc, %[[TMP_A]]) nsw : !s32i, !s32i +// CIR: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !s32i, !cir.ptr +// CIR: cir.return +// CIR: } +// CIR: } + +// LLVM: %[[A_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: store i32 1, ptr %[[A_ADDR]], align 4 +// LLVM: br label %[[BB_2:.*]] +// LLVM: [[BB_2]]: +// LLVM: br label %[[BB_3:.*]] +// LLVM: [[BB_3]]: +// LLVM: %[[TMP_A:.*]] = load i32, ptr %[[A_ADDR]], align 4 +// LLVM: %[[RESULT:.*]] = add nsw i32 %[[TMP_A]], 1 +// LLVM: store i32 %[[RESULT]], ptr %[[A_ADDR]], align 4 +// LLVM: ret void +// LLVM: [[BB_6:.*]]: +// LLVM: br label %[[BB_7:.*]] +// LLVM: [[BB_7]]: +// LLVM: ret void + +// OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 +// OGCG: store i32 1, ptr %[[A_ADDR]], align 4 +// OGCG: %[[TMP_A:.*]] = load i32, ptr %[[A_ADDR]], align 4 +// OGCG: %[[RESULT:.*]] = add nsw i32 %[[TMP_A]], 1 +// OGCG: store i32 %[[RESULT]], ptr %[[A_ADDR]], align 4 +// OGCG: ret void diff --git a/clang/test/CIR/CodeGen/vbase.cpp b/clang/test/CIR/CodeGen/vbase.cpp index 9e4232317977f..8fcb2a442cd16 100644 --- a/clang/test/CIR/CodeGen/vbase.cpp +++ b/clang/test/CIR/CodeGen/vbase.cpp @@ -57,7 +57,7 @@ void ppp() { B b; } // OGCG: @_ZTV1B = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr inttoptr (i64 12 to ptr), ptr null, ptr @_ZTI1B] }, comdat, align 8 -// CIR: cir.func {{.*}}@_Z1fv() { +// CIR: cir.func {{.*}}@_Z1fv() // CIR: %[[D:.+]] = cir.alloca !rec_Derived, !cir.ptr, ["d", init] // CIR: cir.call @_ZN7DerivedC1Ev(%[[D]]) nothrow : (!cir.ptr) -> () // CIR: %[[VPTR_PTR:.+]] = cir.vtable.get_vptr %[[D]] : !cir.ptr -> !cir.ptr @@ -78,7 +78,7 @@ void ppp() { B b; } // CIR: cir.call %[[FN]](%[[BASE_THIS]]) : 
(!cir.ptr)>>, !cir.ptr) -> () // CIR: cir.return -// CIR: cir.func {{.*}}@_Z1gv() { +// CIR: cir.func {{.*}}@_Z1gv() // CIR: %[[DF:.+]] = cir.alloca !rec_DerivedFinal, !cir.ptr, ["df", init] // CIR: cir.call @_ZN12DerivedFinalC1Ev(%[[DF]]) nothrow : (!cir.ptr) -> () // CIR: %[[BASE_THIS_2:.+]] = cir.base_class_addr %[[DF]] : !cir.ptr nonnull [0] -> !cir.ptr @@ -89,7 +89,7 @@ void ppp() { B b; } // CIR: cir.call %[[FN_2]](%[[BASE_THIS_2]]) : (!cir.ptr)>>, !cir.ptr) -> () // CIR: cir.return -// LLVM: define {{.*}}void @_Z1fv() +// LLVM: define {{.*}}void @_Z1fv(){{.*}} // LLVM: %[[D:.+]] = alloca {{.*}} // LLVM: call void @_ZN7DerivedC1Ev(ptr %[[D]]) // LLVM: %[[VPTR_ADDR:.+]] = load ptr, ptr %[[D]] @@ -102,7 +102,7 @@ void ppp() { B b; } // LLVM: call void %[[VFN]](ptr %[[ADJ_THIS]]) // LLVM: ret void -// LLVM: define {{.*}}void @_Z1gv() +// LLVM: define {{.*}}void @_Z1gv(){{.*}} // LLVM: %[[DF:.+]] = alloca {{.*}} // LLVM: call void @_ZN12DerivedFinalC1Ev(ptr %[[DF]]) // LLVM: %[[VPTR2:.+]] = load ptr, ptr %[[DF]] @@ -138,7 +138,7 @@ void ppp() { B b; } // CIR: cir.store align(8) %[[VTABLE]], %[[B_VPTR]] : !cir.vptr, !cir.ptr // CIR: cir.return -// LLVM: define{{.*}} void @_ZN1BC1Ev(ptr %[[THIS_ARG:.*]]) { +// LLVM: define{{.*}} void @_ZN1BC1Ev(ptr %[[THIS_ARG:.*]]){{.*}} { // LLVM: %[[THIS_ADDR:.*]] = alloca ptr // LLVM: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] // LLVM: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] diff --git a/clang/test/CIR/CodeGen/vla.c b/clang/test/CIR/CodeGen/vla.c new file mode 100644 index 0000000000000..b22c704f67198 --- /dev/null +++ b/clang/test/CIR/CodeGen/vla.c @@ -0,0 +1,285 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +void f0(int len) { + int arr[len]; +} + +// CIR: cir.func{{.*}} @f0(%[[LEN_ARG:.*]]: !s32i {{.*}}) +// CIR: %[[LEN_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["len", init] +// CIR: %[[SAVED_STACK:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["saved_stack"] +// CIR: cir.store{{.*}} %[[LEN_ARG]], %[[LEN_ADDR]] +// CIR: %[[LEN:.*]] = cir.load{{.*}} %[[LEN_ADDR]] +// CIR: %[[LEN_SIZE_T:.*]] = cir.cast integral %[[LEN]] : !s32i -> !u64i +// CIR: %[[STACK_PTR:.*]] = cir.stacksave +// CIR: cir.store{{.*}} %[[STACK_PTR]], %[[SAVED_STACK]] +// CIR: %[[ARR:.*]] = cir.alloca !s32i, !cir.ptr, %[[LEN_SIZE_T]] : !u64i, ["arr"] +// CIR: %[[STACK_RESTORE_PTR:.*]] = cir.load{{.*}} %[[SAVED_STACK]] +// CIR: cir.stackrestore %[[STACK_RESTORE_PTR]] + +// LLVM: define{{.*}} void @f0(i32 %[[LEN_ARG:.*]]) +// LLVM: %[[LEN_ADDR:.*]] = alloca i32 +// LLVM: %[[SAVED_STACK:.*]] = alloca ptr +// LLVM: store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]] +// LLVM: %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]] +// LLVM: %[[LEN_SIZE_T:.*]] = sext i32 %[[LEN]] to i64 +// LLVM: %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0() +// LLVM: store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]] +// LLVM: %[[ARR:.*]] = alloca i32, i64 %[[LEN_SIZE_T]] +// LLVM: %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]] +// LLVM: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]]) + +// Note: VLA_EXPR0 below is emitted to capture debug info. 
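+// The CIR and LLVM prefixes above have no corresponding check: only the OGCG +// (classic codegen) output emits this extra alloca for the VLA bound.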
+ +// OGCG: define{{.*}} void @f0(i32 {{.*}} %[[LEN_ARG:.*]]) +// OGCG: %[[LEN_ADDR:.*]] = alloca i32 +// OGCG: %[[SAVED_STACK:.*]] = alloca ptr +// OGCG: %[[VLA_EXPR0:.*]] = alloca i64 +// OGCG: store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]] +// OGCG: %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]] +// OGCG: %[[LEN_SIZE_T:.*]] = zext i32 %[[LEN]] to i64 +// OGCG: %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0() +// OGCG: store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]] +// OGCG: %[[ARR:.*]] = alloca i32, i64 %[[LEN_SIZE_T]] +// OGCG: store i64 %[[LEN_SIZE_T]], ptr %[[VLA_EXPR0]] +// OGCG: %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]] +// OGCG: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]]) + +void f1(int len) { + int arr[16][len]; +} + +// CIR: cir.func{{.*}} @f1(%[[LEN_ARG:.*]]: !s32i {{.*}}) +// CIR: %[[LEN_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["len", init] +// CIR: %[[SAVED_STACK:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["saved_stack"] +// CIR: cir.store{{.*}} %[[LEN_ARG]], %[[LEN_ADDR]] +// CIR: %[[SIXTEEN:.*]] = cir.const #cir.int<16> : !s32i +// CIR: %[[SIXTEEN_SIZE_T:.*]] = cir.cast integral %[[SIXTEEN]] : !s32i -> !u64i +// CIR: %[[LEN:.*]] = cir.load{{.*}} %[[LEN_ADDR]] +// CIR: %[[LEN_SIZE_T:.*]] = cir.cast integral %[[LEN]] : !s32i -> !u64i +// CIR: %[[STACK_PTR:.*]] = cir.stacksave +// CIR: cir.store{{.*}} %[[STACK_PTR]], %[[SAVED_STACK]] +// CIR: %[[TOTAL_LEN:.*]] = cir.binop(mul, %[[SIXTEEN_SIZE_T]], %[[LEN_SIZE_T]]) nuw +// CIR: %[[ARR:.*]] = cir.alloca !s32i, !cir.ptr, %[[TOTAL_LEN]] : !u64i, ["arr"] +// CIR: %[[STACK_RESTORE_PTR:.*]] = cir.load{{.*}} %[[SAVED_STACK]] +// CIR: cir.stackrestore %[[STACK_RESTORE_PTR]] + +// LLVM: define{{.*}} void @f1(i32 %[[LEN_ARG:.*]]) +// LLVM: %[[LEN_ADDR:.*]] = alloca i32 +// LLVM: %[[SAVED_STACK:.*]] = alloca ptr +// LLVM: store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]] +// LLVM: %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]] +// LLVM: %[[LEN_SIZE_T:.*]] = sext i32 %[[LEN]] to i64 +// LLVM: %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0() +// LLVM: store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]] +// LLVM: %[[TOTAL_LEN:.*]] = mul nuw i64 16, %[[LEN_SIZE_T]] +// LLVM: %[[ARR:.*]] = alloca i32, i64 %[[TOTAL_LEN]] +// LLVM: %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]] +// LLVM: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]]) + +// Note: VLA_EXPR0 below is emitted to capture debug info. 
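+// Note also that the constant dimension (16) is folded into a single +// `mul nuw` with the variable bound, as the LLVM and OGCG checks show.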
+ +// OGCG: define{{.*}} void @f1(i32 {{.*}} %[[LEN_ARG:.*]]) +// OGCG: %[[LEN_ADDR:.*]] = alloca i32 +// OGCG: %[[SAVED_STACK:.*]] = alloca ptr +// OGCG: %[[VLA_EXPR0:.*]] = alloca i64 +// OGCG: store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]] +// OGCG: %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]] +// OGCG: %[[LEN_SIZE_T:.*]] = zext i32 %[[LEN]] to i64 +// OGCG: %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0() +// OGCG: store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]] +// OGCG: %[[TOTAL_LEN:.*]] = mul nuw i64 16, %[[LEN_SIZE_T]] +// OGCG: %[[ARR:.*]] = alloca i32, i64 %[[TOTAL_LEN]] +// OGCG: store i64 %[[LEN_SIZE_T]], ptr %[[VLA_EXPR0]] +// OGCG: %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]] +// OGCG: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]]) + +void f2(int len) { + int arr[len + 4]; +} + +// CIR: cir.func{{.*}} @f2(%[[LEN_ARG:.*]]: !s32i {{.*}}) +// CIR: %[[LEN_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["len", init] +// CIR: %[[SAVED_STACK:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["saved_stack"] +// CIR: cir.store{{.*}} %[[LEN_ARG]], %[[LEN_ADDR]] +// CIR: %[[LEN:.*]] = cir.load{{.*}} %[[LEN_ADDR]] +// CIR: %[[FOUR:.*]] = cir.const #cir.int<4> : !s32i +// CIR: %[[TOTAL_LEN:.*]] = cir.binop(add, %[[LEN]], %[[FOUR]]) nsw : !s32i +// CIR: %[[TOTAL_LEN_SIZE_T:.*]] = cir.cast integral %[[TOTAL_LEN]] : !s32i -> !u64i +// CIR: %[[STACK_PTR:.*]] = cir.stacksave +// CIR: cir.store{{.*}} %[[STACK_PTR]], %[[SAVED_STACK]] +// CIR: %[[ARR:.*]] = cir.alloca !s32i, !cir.ptr, %[[TOTAL_LEN_SIZE_T]] : !u64i, ["arr"] +// CIR: %[[STACK_RESTORE_PTR:.*]] = cir.load{{.*}} %[[SAVED_STACK]] +// CIR: cir.stackrestore %[[STACK_RESTORE_PTR]] + +// LLVM: define{{.*}} void @f2(i32 %[[LEN_ARG:.*]]) +// LLVM: %[[LEN_ADDR:.*]] = alloca i32 +// LLVM: %[[SAVED_STACK:.*]] = alloca ptr +// LLVM: store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]] +// LLVM: %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]] +// LLVM: %[[TOTAL_LEN:.*]] = add nsw i32 %[[LEN]], 4 +// LLVM: %[[TOTAL_LEN_SIZE_T:.*]] = sext i32 %[[TOTAL_LEN]] to i64 +// LLVM: %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0() +// LLVM: store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]] +// LLVM: %[[ARR:.*]] = alloca i32, i64 %[[TOTAL_LEN_SIZE_T]] +// LLVM: %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]] +// LLVM: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]]) + +// Note: VLA_EXPR0 below is emitted to capture debug info. 
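+// Here the bound expression `len + 4` is evaluated once (the `add nsw` +// above), and only the widened result feeds the alloca size.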
+ +// OGCG: define{{.*}} void @f2(i32 {{.*}} %[[LEN_ARG:.*]]) +// OGCG: %[[LEN_ADDR:.*]] = alloca i32 +// OGCG: %[[SAVED_STACK:.*]] = alloca ptr +// OGCG: %[[VLA_EXPR0:.*]] = alloca i64 +// OGCG: store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]] +// OGCG: %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]] +// OGCG: %[[TOTAL_LEN:.*]] = add nsw i32 %[[LEN]], 4 +// OGCG: %[[TOTAL_LEN_SIZE_T:.*]] = zext i32 %[[TOTAL_LEN]] to i64 +// OGCG: %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0() +// OGCG: store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]] +// OGCG: %[[ARR:.*]] = alloca i32, i64 %[[TOTAL_LEN_SIZE_T]] +// OGCG: store i64 %[[TOTAL_LEN_SIZE_T]], ptr %[[VLA_EXPR0]] +// OGCG: %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]] +// OGCG: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]]) + +void f3(unsigned len) { + char s1[len]; + unsigned i = 0u; + while (++i < len) { + char s2[i]; + } +} + +// CIR: cir.func{{.*}} @f3(%[[LEN_ARG:.*]]: !u32i {{.*}}) +// CIR: %[[LEN_ADDR:.*]] = cir.alloca !u32i, !cir.ptr, ["len", init] +// CIR: %[[SAVED_STACK:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["saved_stack"] +// CIR: cir.store{{.*}} %[[LEN_ARG]], %[[LEN_ADDR]] +// CIR: %[[LEN:.*]] = cir.load{{.*}} %[[LEN_ADDR]] +// CIR: %[[LEN_SIZE_T:.*]] = cir.cast integral %[[LEN]] : !u32i -> !u64i +// CIR: %[[STACK_PTR:.*]] = cir.stacksave +// CIR: cir.store{{.*}} %[[STACK_PTR]], %[[SAVED_STACK]] +// CIR: %[[S1:.*]] = cir.alloca !s8i, !cir.ptr, %[[LEN_SIZE_T]] : !u64i, ["s1"] +// CIR: %[[I:.*]] = cir.alloca !u32i, !cir.ptr, ["i", init] +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !u32i +// CIR: cir.store{{.*}} %[[ZERO]], %[[I]] +// CIR: cir.scope { +// CIR: cir.while { +// CIR: %[[CUR_I:.*]] = cir.load{{.*}} %[[I]] +// CIR: %[[NEXT:.*]] = cir.unary(inc, %[[CUR_I]]) +// CIR: cir.store{{.*}} %[[NEXT]], %[[I]] +// CIR: %[[LEN2:.*]] = cir.load{{.*}} %[[LEN_ADDR]] +// CIR: %[[CMP:.*]] = cir.cmp(lt, %[[NEXT]], %[[LEN2]]) +// CIR: cir.condition(%[[CMP]]) +// CIR: } do { +// CIR: cir.scope { +// CIR: %[[SAVED_STACK2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["saved_stack"] +// CIR: %[[I_LEN:.*]] = cir.load{{.*}} %[[I]] +// CIR: %[[I_LEN_SIZE_T2:.*]] = cir.cast integral %[[I_LEN]] : !u32i -> !u64i +// CIR: %[[STACK_PTR2:.*]] = cir.stacksave +// CIR: cir.store{{.*}} %[[STACK_PTR2]], %[[SAVED_STACK2]] +// CIR: %[[S2:.*]] = cir.alloca !s8i, !cir.ptr, %[[I_LEN_SIZE_T2]] : !u64i, ["s2"] +// CIR: %[[SAVED_RESTORE_PTR2:.*]] = cir.load{{.*}} %[[SAVED_STACK2]] +// CIR: cir.stackrestore %[[SAVED_RESTORE_PTR2]] +// CIR: } +// CIR: cir.yield +// CIR: } +// CIR: } +// CIR: %[[STACK_RESTORE_PTR:.*]] = cir.load{{.*}} %[[SAVED_STACK]] +// CIR: cir.stackrestore %[[STACK_RESTORE_PTR]] + +// LLVM: define{{.*}} void @f3(i32 %[[LEN_ARG:.*]]) +// LLVM: %[[SAVED_STACK2:.*]] = alloca ptr +// LLVM: %[[LEN_ADDR:.*]] = alloca i32 +// LLVM: %[[SAVED_STACK:.*]] = alloca ptr +// LLVM: store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]] +// LLVM: %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]] +// LLVM: %[[LEN_SIZE_T:.*]] = zext i32 %[[LEN]] to i64 +// LLVM: %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0() +// LLVM: store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]] +// LLVM: %[[S1:.*]] = alloca i8, i64 %[[LEN_SIZE_T]] +// LLVM: %[[I:.*]] = alloca i32 +// LLVM: store i32 0, ptr %[[I]] +// LLVM: br label %[[WHILE_START:.*]] +// LLVM: [[WHILE_START]]: +// LLVM: br label %[[WHILE_COND:.*]] +// LLVM: [[WHILE_COND]]: +// LLVM: %[[CUR_I:.*]] = load i32, ptr %[[I]] +// LLVM: %[[NEXT:.*]] = add i32 %[[CUR_I]], 1 +// LLVM: store i32 %[[NEXT]], ptr %[[I]] +// LLVM: %[[LEN2:.*]] = load i32, 
ptr %[[LEN_ADDR]] +// LLVM: %[[CMP:.*]] = icmp ult i32 %[[NEXT]], %[[LEN2]] +// LLVM: br i1 %[[CMP]], label %[[WHILE_BODY:.*]], label %[[WHILE_END:.*]] +// LLVM: [[WHILE_BODY]]: +// LLVM: br label %[[WHILE_BODY2:.*]] +// LLVM: [[WHILE_BODY2]]: +// LLVM: %[[I_LEN:.*]] = load i32, ptr %[[I]] +// LLVM: %[[I_LEN_SIZE_T2:.*]] = zext i32 %[[I_LEN]] to i64 +// LLVM: %[[STACK_PTR2:.*]] = call ptr @llvm.stacksave.p0() +// LLVM: store ptr %[[STACK_PTR2]], ptr %[[SAVED_STACK2]] +// LLVM: %[[S2:.*]] = alloca i8, i64 %[[I_LEN_SIZE_T2]] +// LLVM: %[[STACK_RESTORE_PTR2:.*]] = load ptr, ptr %[[SAVED_STACK2]] +// LLVM: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR2]]) +// LLVM: br label %[[WHILE_BODY_END:.*]] +// LLVM: [[WHILE_BODY_END]]: +// LLVM: br label %[[WHILE_COND]] +// LLVM: [[WHILE_END]]: +// LLVM: br label %[[F3_END:.*]] +// LLVM: [[F3_END]]: +// LLVM: %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]] +// LLVM: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]]) + +// Note: VLA_EXPR0 and VLA_EXPR1 below are emitted to capture debug info. + +// OGCG: define{{.*}} void @f3(i32 {{.*}} %[[LEN_ARG:.*]]) +// OGCG: %[[LEN_ADDR:.*]] = alloca i32 +// OGCG: %[[SAVED_STACK:.*]] = alloca ptr +// OGCG: %[[VLA_EXPR0:.*]] = alloca i64 +// OGCG: %[[I:.*]] = alloca i32 +// OGCG: %[[SAVED_STACK1:.*]] = alloca ptr +// OGCG: %[[VLA_EXPR1:.*]] = alloca i64 +// OGCG: store i32 %[[LEN_ARG]], ptr %[[LEN_ADDR]] +// OGCG: %[[LEN:.*]] = load i32, ptr %[[LEN_ADDR]] +// OGCG: %[[LEN_SIZE_T:.*]] = zext i32 %[[LEN]] to i64 +// OGCG: %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0() +// OGCG: store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]] +// OGCG: %[[S1:.*]] = alloca i8, i64 %[[LEN_SIZE_T]] +// OGCG: store i64 %[[LEN_SIZE_T]], ptr %[[VLA_EXPR0]] +// OGCG: br label %[[WHILE_COND:.*]] +// OGCG: [[WHILE_COND]]: +// OGCG: %[[CUR_I:.*]] = load i32, ptr %[[I]] +// OGCG: %[[NEXT:.*]] = add i32 %[[CUR_I]], 1 +// OGCG: store i32 %[[NEXT]], ptr %[[I]] +// OGCG: %[[LEN2:.*]] = load i32, ptr %[[LEN_ADDR]] +// OGCG: %[[CMP:.*]] = icmp ult i32 %[[NEXT]], %[[LEN2]] +// OGCG: br i1 %[[CMP]], label %[[WHILE_BODY:.*]], label %[[WHILE_END:.*]] +// OGCG: [[WHILE_BODY]]: +// OGCG: %[[I_LEN:.*]] = load i32, ptr %[[I]] +// OGCG: %[[I_LEN_SIZE_T:.*]] = zext i32 %[[I_LEN]] to i64 +// OGCG: %[[STACK_PTR1:.*]] = call ptr @llvm.stacksave.p0() +// OGCG: store ptr %[[STACK_PTR1]], ptr %[[SAVED_STACK1]] +// OGCG: %[[S2:.*]] = alloca i8, i64 %[[I_LEN_SIZE_T]] +// OGCG: store i64 %[[I_LEN_SIZE_T]], ptr %[[VLA_EXPR1]] +// OGCG: %[[STACK_RESTORE_PTR1:.*]] = load ptr, ptr %[[SAVED_STACK1]] +// OGCG: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR1]]) +// OGCG: br label %[[WHILE_COND]] +// OGCG: [[WHILE_END]]: +// OGCG: %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]] +// OGCG: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]]) + + +// The following test case is disabled because it runs into a bug (unrelated +// to VLA) in the handling of cleanups in loops with break statements. 
+// +// void f4(unsigned len) { +// char s1[len]; +// while (1) { +// char s2[len]; +// if (1) +// break; +// } +// } + \ No newline at end of file diff --git a/clang/test/CIR/CodeGen/vtt.cpp b/clang/test/CIR/CodeGen/vtt.cpp index dc30c3251db69..f9a62e37450cf 100644 --- a/clang/test/CIR/CodeGen/vtt.cpp +++ b/clang/test/CIR/CodeGen/vtt.cpp @@ -445,7 +445,7 @@ D::D() {} // CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[C_ADDR]] : !cir.ptr -> !cir.ptr // CIR-COMMON: cir.store{{.*}} %[[C_VPTR]], %[[C_VPTR_ADDR]] : !cir.vptr, !cir.ptr -// LLVM-COMMON: define {{.*}} void @_ZN1DC2Ev(ptr %[[THIS_ARG:.*]], ptr %[[VTT_ARG:.*]]) { +// LLVM-COMMON: define {{.*}} void @_ZN1DC2Ev(ptr %[[THIS_ARG:.*]], ptr %[[VTT_ARG:.*]]){{.*}} { // LLVM-COMMON: %[[THIS_ADDR:.*]] = alloca ptr // LLVM-COMMON: %[[VTT_ADDR:.*]] = alloca ptr // LLVM-COMMON: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] @@ -484,7 +484,7 @@ D::D() {} // CIR-COMMON: %[[VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] : !cir.ptr -> !cir.ptr // CIR-COMMON: cir.store{{.*}} %[[VPTR]], %[[VPTR_ADDR]] : !cir.vptr, !cir.ptr -// LLVM-COMMON: define {{.*}} void @_ZN1AC2Ev(ptr %[[THIS_ARG:.*]]) { +// LLVM-COMMON: define {{.*}} void @_ZN1AC2Ev(ptr %[[THIS_ARG:.*]]){{.*}} { // LLVM-COMMON: %[[THIS_ADDR:.*]] = alloca ptr, i64 1, align 8 // LLVM-COMMON: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]], align 8 // LLVM-COMMON: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8 diff --git a/clang/test/CIR/CodeGenOpenACC/cache.c b/clang/test/CIR/CodeGenOpenACC/cache.c index 76651c132f738..d82230a5e3841 100644 --- a/clang/test/CIR/CodeGenOpenACC/cache.c +++ b/clang/test/CIR/CodeGenOpenACC/cache.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s void acc_cache() { - // CHECK: cir.func{{.*}} @acc_cache() { + // CHECK: cir.func{{.*}} @acc_cache() int iArr[10]; // CHECK-NEXT: %[[IARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["iArr"] diff --git a/clang/test/CIR/CodeGenOpenACC/combined-copy.c b/clang/test/CIR/CodeGenOpenACC/combined-copy.c index 9afbab52c18ac..31956b383df02 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-copy.c +++ b/clang/test/CIR/CodeGenOpenACC/combined-copy.c @@ -2,7 +2,7 @@ int global; void acc_compute(int parmVar) { - // CHECK: cir.func{{.*}} @acc_compute(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK: cir.func{{.*}} @acc_compute(%[[ARG:.*]]: !s32i{{.*}}) // CHECK-NEXT: %[[PARM:.*]] = cir.alloca !s32i, !cir.ptr, ["parmVar", init] int localVar1; short localVar2; @@ -1082,7 +1082,7 @@ typedef struct OuterTy { } Outer; void copy_member_of_array_element_member() { - // CHECK: cir.func{{.*}} @copy_member_of_array_element_member() { + // CHECK: cir.func{{.*}} @copy_member_of_array_element_member() Outer outer; // CHECK-NEXT: %[[OUTER:.*]] = cir.alloca !rec_OuterTy, !cir.ptr, ["outer"] @@ -1104,7 +1104,7 @@ void copy_member_of_array_element_member() { } void modifier_list() { - // CHECK: cir.func{{.*}} @modifier_list() { + // CHECK: cir.func{{.*}} @modifier_list() int localVar; // CHECK-NEXT: %[[LOCALVAR:.*]] = cir.alloca !s32i, !cir.ptr, ["localVar"] diff --git a/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp b/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp index aa1103d49c7ae..94f3f1ace4350 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp @@ -324,7 +324,7 @@ struct HasDtor { // extern "C" void acc_combined() { - // CHECK: cir.func{{.*}} 
@acc_combined() { + // CHECK: cir.func{{.*}} @acc_combined() int someInt; // CHECK-NEXT: %[[SOMEINT:.*]] = cir.alloca !s32i, !cir.ptr, ["someInt"] diff --git a/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp index 7a7338cd93549..ee82757aa9f7c 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp @@ -158,7 +158,7 @@ struct HasDtor { // CHECK-NEXT: } extern "C" void acc_combined() { - // CHECK: cir.func{{.*}} @acc_combined() { + // CHECK: cir.func{{.*}} @acc_combined() int someInt; // CHECK-NEXT: %[[SOMEINT:.*]] = cir.alloca !s32i, !cir.ptr, ["someInt"] diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp index 040ddd3ca458c..c1c2e4b715365 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct DefaultOperators { int i; @@ -43,12 +43,43 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = 
cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ADD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(*:someVar) - // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTS16DefaultOperators : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init] @@ -71,10 +102,42 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = 
"d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[MUL]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(max:someVar) // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS16DefaultOperators : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) @@ -98,10 +161,81 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} 
%[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(min:someVar) // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS16DefaultOperators : !cir.ptr reduction_operator init { // 
CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) @@ -125,10 +259,81 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : 
!cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(&:someVarNoFloats) // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTS24DefaultOperatorsNoFloats : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) @@ -146,7 +351,27 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] 
: !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[AND]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -167,7 +392,27 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[OR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -188,10 +433,30 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, 
%[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[XOR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(&&:someVar) // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS16DefaultOperators : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) @@ -218,7 +483,7 @@ void acc_combined() { // TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(||:someVar) // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS16DefaultOperators : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) @@ -245,7 +510,7 @@ void acc_combined() { // TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(+:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { @@ -286,10 +551,65 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : 
!cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ADD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = 
cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(*:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) @@ -388,11 +708,67 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name 
= "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[MUL]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(max:someVarArr) + // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] @@ -490,10 +866,103 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : 
!cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: 
cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(min:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) @@ -592,7 +1061,100 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], 
%[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member 
%[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -664,10 +1226,53 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : 
!cir.ptr, !u32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[AND]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(|:someVarArrNoFloats) // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) @@ -702,7 +1307,50 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member 
%[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[OR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -739,10 +1387,53 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !s32i, 
!cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[XOR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(&&:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) @@ -844,9 +1535,9 @@ void acc_combined() { // TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(||:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -888,7 +1579,7 @@ void acc_combined() { // TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(+:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_add__Bcnt1__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { @@ -935,9 +1626,67 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// 
CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: 
%[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ADD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
// CHECK-NEXT: }
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
#pragma acc parallel loop reduction(*:someVarArr[2])
// CHECK-NEXT: acc.reduction.recipe @reduction_mul__Bcnt1__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
@@ -983,9 +1732,67 @@ void acc_combined() {
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[MUL]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
// CHECK-NEXT: }
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
#pragma acc parallel loop reduction(max:someVarArr[2])
// CHECK-NEXT: acc.reduction.recipe @reduction_max__Bcnt1__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
@@ -1031,9 +1838,109 @@ void acc_combined() {
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = 
acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, 
%[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(min:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_min__Bcnt1__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) @@ -1079,9 +1986,109 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> 
{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: 
%[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(&:someVarArrNoFloats[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_iand__Bcnt1__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) @@ -1121,9 +2128,55 @@ void acc_combined() { // 
CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[AND]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load 
%[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(|:someVarArrNoFloats[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_ior__Bcnt1__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) @@ -1163,9 +2216,55 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : 
!cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[OR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(^:someVarArrNoFloats[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_xor__Bcnt1__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) @@ -1205,9 +2304,55 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = 
cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[XOR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(&&:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_land__Bcnt1__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) @@ -1255,7 +2400,7 @@ void acc_combined() { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(||:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_lor__Bcnt1__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) @@ -1303,26 +2448,26 @@ void acc_combined() { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(+:someVarArr[1:1]) - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(*:someVarArr[1:1]) - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(max:someVarArr[1:1]) - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(min:someVarArr[1:1]) - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(&:someVarArrNoFloats[1:1]) for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(|:someVarArrNoFloats[1:1]) for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(^:someVarArrNoFloats[1:1]) - for(int 
i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(&&:someVarArr[1:1]) - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc parallel loop reduction(||:someVarArr[1:1]) - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); // CHECK-NEXT: cir.func {{.*}}@_Z12acc_combined } diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp index 6e885cc12d01a..853f345e53ddf 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_combined() { T someVar; @@ -13,7 +13,10 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -26,7 +29,10 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } @@ -40,7 +46,17 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -53,7 +69,17 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] 
: !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -111,7 +137,31 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -142,7 +192,31 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: 
%[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -173,7 +247,38 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -204,7 +309,38 @@ void acc_combined() { 
// // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -302,6 +438,35 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr 
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -337,6 +502,35 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -372,6 +566,42 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: 
!acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -407,6 +637,42 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load 
%[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -442,6 +708,7 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -477,6 +744,7 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp index 3d46ac716c1ad..67e8460649f7e 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsInline { int i; @@ -17,7 +17,7 @@ struct HasOperatorsInline { bool &operator&&(HasOperatorsInline& other); bool &operator||(HasOperatorsInline& other); // For min/max - HasOperatorsInline &operator<(HasOperatorsInline& other); + bool operator<(HasOperatorsInline& other); HasOperatorsInline &operator=(HasOperatorsInline& other); }; @@ -48,7 +48,7 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: 
!cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinepLERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -79,7 +79,7 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinemLERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -88,7 +88,7 @@ void acc_combined() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(max:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_max__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -110,7 +110,13 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -119,7 +125,7 @@ void acc_combined() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(min:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_min__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -141,7 +147,13 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call 
@_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -172,7 +184,7 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaNERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -181,7 +193,7 @@ void acc_combined() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(|:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_ior__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -203,7 +215,7 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineoRERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -212,7 +224,7 @@ void acc_combined() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(^:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_xor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -234,7 +246,7 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineeOERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -243,7 +255,7 @@ void acc_combined() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(&&:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_land__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ 
-344,7 +356,29 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinepLERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -466,7 +500,28 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinemLERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] 
: !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -588,7 +643,34 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -710,7 +792,34 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], 
+// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true {
+// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
@@ -832,7 +941,28 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaNERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
@@ -896,7 +1026,28 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineoRERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
@@ -959,7 +1110,28 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineeOERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
@@ -1216,6 +1388,32 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinepLERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
@@ -1293,6 +1491,32 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinemLERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
@@ -1370,6 +1594,38 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true {
+// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
@@ -1447,6 +1703,38 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true {
+// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
@@ -1524,6 +1812,32 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaNERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
@@ -1601,6 +1915,32 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineoRERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
@@ -1678,6 +2018,32 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineeOERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
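All of the array recipes above share one lowering shape: decay both five-element sections to element pointers, walk them with a single iterator, and combine the elements pairwise with the user-provided compound-assignment operator before yielding the LHS argument back. As a plain C++ sketch of that shape (the struct mirrors the test's HasOperatorsInline, but combineAdd itself is a hypothetical illustration, not code from the test):

// Element-wise combiner in plain C++; the cir.for cond/body/step
// regions in the CHECK lines spell out exactly this loop in CIR.
struct HasOperatorsInline {
  int i;
  HasOperatorsInline &operator+=(HasOperatorsInline &rhs) {
    i += rhs.i;
    return *this;
  }
};

// Hypothetical helper: lhs[idx] += rhs[idx] for idx in [0, 5);
// the recipe then yields the lhs argument via acc.yield.
inline void combineAdd(HasOperatorsInline (&lhs)[5],
                       HasOperatorsInline (&rhs)[5]) {
  for (long idx = 0; idx < 5; ++idx)
    lhs[idx] += rhs[idx];
}

The bounds-qualified variants differ only in their loop limits, which they take from acc.get_lowerbound/acc.get_upperbound instead of the constant array extent.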
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
index e9669d390f594..d74de8220225a 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 template
 void acc_combined() {
@@ -14,7 +14,10 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -28,7 +31,10 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -41,7 +47,17 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !s32i, !cir.ptr
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -54,7 +70,17 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !s32i, !cir.ptr
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -67,7 +93,10 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -81,7 +110,10 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -95,7 +127,10 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 
@@ -155,7 +190,31 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -186,7 +245,31 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -217,7 +300,38 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"]
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[ITR_CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -248,7 +362,38 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"]
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[ITR_CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -279,7 +424,31 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -309,7 +478,31 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -339,7 +532,31 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -437,6 +654,35 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -472,6 +718,35 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -507,6 +782,42 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -542,6 +853,42 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -577,6 +924,35 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -612,6 +988,35 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -647,6 +1052,35 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -682,6 +1116,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// TODO OpenACC: Expecting combination operation here
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -717,6 +1152,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// TODO OpenACC: Expecting combination operation here
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
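The min and max recipes above have no single compound-assignment call to lower to, so the combiner compares the elements with operator< and writes the selected value back through the copy-assignment operator; that is the cir.ternary-plus-operator= sequence in the CHECK lines. A minimal C++ sketch of the same select-then-assign pattern, using hypothetical names rather than anything taken from the test:

// Hypothetical standalone illustration of the max combiner pattern:
// compare with operator<, select with a conditional expression, then
// write back through copy assignment.
struct S {
  int i;
  S &operator=(const S &) = default;
};
inline bool operator<(S &l, S &r) { return l.i < r.i; }

// Keep the larger element in lhs; the min recipe simply swaps the
// two arms of the conditional.
inline void combineMax(S &lhs, S &rhs) {
  lhs = (lhs < rhs) ? rhs : lhs;
}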
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 struct HasOperatorsOutline {
   int i;
   unsigned u;
@@ -7,7 +7,7 @@ struct HasOperatorsOutline {
   bool b;
   ~HasOperatorsOutline();
-HasOperatorsOutline &operator=(const HasOperatorsOutline &);
+  HasOperatorsOutline &operator=(const HasOperatorsOutline &);
 };

 HasOperatorsOutline &operator+=(HasOperatorsOutline &, HasOperatorsOutline &);
@@ -18,7 +18,7 @@ HasOperatorsOutline &operator^=(HasOperatorsOutline &, HasOperatorsOutline &);
 bool &operator&&(HasOperatorsOutline &, HasOperatorsOutline &);
 bool &operator||(HasOperatorsOutline &, HasOperatorsOutline &);
 // For min/max
-HasOperatorsOutline &operator<(HasOperatorsOutline &, HasOperatorsOutline &);
+bool operator<(HasOperatorsOutline &, HasOperatorsOutline &);

 template<typename T>
 void acc_combined() {
@@ -47,7 +47,7 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.call @_ZpLR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}):
@@ -55,7 +55,6 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
-
 #pragma acc parallel loop reduction(*:someVar)
 // CHECK: acc.reduction.recipe @reduction_mul__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline>{{.*}})
@@ -78,17 +77,17 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 //
 // CHECK-NEXT: } combiner {
- for(int i=0;i < 5; ++i);
-#pragma acc parallel loop reduction(max:someVar)
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.call @_ZmLR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}):
 // CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineD1Ev(%[[ARG]]) nothrow : (!cir.ptr<!rec_HasOperatorsOutline>)
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: }
-// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator init {
+ for(int i=0;i < 5; ++i);
+#pragma acc parallel loop reduction(max:someVar)
+// CHECK: acc.reduction.recipe @reduction_max__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr<!rec_HasOperatorsOutline>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_HasOperatorsOutline> -> !cir.ptr<!s32i>
@@ -110,7 +109,13 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true {
+// CHECK-NEXT: cir.yield %[[RHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: cir.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[TERNARY]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}):
@@ -119,7 +124,7 @@ void acc_combined() {
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
 #pragma acc parallel loop reduction(min:someVar)
-// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator init {
+// CHECK: acc.reduction.recipe @reduction_min__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr<!rec_HasOperatorsOutline>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_HasOperatorsOutline> -> !cir.ptr<!s32i>
@@ -141,7 +146,13 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true {
+// CHECK-NEXT: cir.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: cir.yield %[[RHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr<!rec_HasOperatorsOutline>
+// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[TERNARY]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}):
@@ -172,7 +183,7 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.call @_ZaNR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}):
@@ -181,7 +192,7 @@ void acc_combined() {
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
 #pragma acc parallel loop reduction(|:someVar)
-// CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator init {
+// CHECK: acc.reduction.recipe @reduction_ior__ZTS19HasOperatorsOutline : !cir.ptr<!rec_HasOperatorsOutline> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr<!rec_HasOperatorsOutline>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr<!rec_HasOperatorsOutline> -> !cir.ptr<!s32i>
@@ -203,7 +214,7 @@ void acc_combined() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!rec_HasOperatorsOutline> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: cir.call @_ZoRR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr<!rec_HasOperatorsOutline>, !cir.ptr<!rec_HasOperatorsOutline>) -> !cir.ptr<!rec_HasOperatorsOutline>
 // CHECK-NEXT:
acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -212,7 +223,7 @@ void acc_combined() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(^:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_xor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -234,7 +245,7 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZeOR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -243,7 +254,7 @@ void acc_combined() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(&&:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_land__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -344,7 +355,29 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZpLR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // 
CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -466,7 +499,28 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZmLR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -588,7 +642,34 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield 
%[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -710,7 +791,34 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -832,7 +940,28 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, 
!cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZaNR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -896,7 +1025,28 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZoRR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -959,7 +1109,28 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: 
%[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZeOR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -982,6 +1153,7 @@ void acc_combined() { // CHECK-NEXT: } // CHECK-NEXT: acc.yield // CHECK-NEXT: } + for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(&&:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { @@ -1106,8 +1278,7 @@ void acc_combined() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(||:someVarArr) - -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1216,6 +1387,32 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: 
%[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZpLR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1293,6 +1490,32 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZmLR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1370,6 +1593,38 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : 
(!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1447,6 +1702,38 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride 
%[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1524,6 +1811,32 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZaNR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1601,6 +1914,32 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: 
%[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZoRR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1678,6 +2017,32 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZeOR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) 
: (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/compute-copy.c b/clang/test/CIR/CodeGenOpenACC/compute-copy.c index d7676d6d30c1e..41e594ec3551b 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-copy.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-copy.c @@ -2,7 +2,7 @@ int global; void acc_compute(int parmVar) { - // CHECK: cir.func{{.*}} @acc_compute(%[[ARG:.*]]: !s32i{{.*}}) { + // CHECK: cir.func{{.*}} @acc_compute(%[[ARG:.*]]: !s32i{{.*}}) // CHECK-NEXT: %[[PARM:.*]] = cir.alloca !s32i, !cir.ptr, ["parmVar", init] int localVar1; short localVar2; @@ -899,7 +899,7 @@ void acc_compute_members() { } void modifier_list() { - // CHECK: cir.func{{.*}} @modifier_list() { + // CHECK: cir.func{{.*}} @modifier_list() int localVar; // CHECK-NEXT: %[[LOCALVAR:.*]] = cir.alloca !s32i, !cir.ptr, ["localVar"] diff --git a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.c b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.c index 184f904d3928f..52342e7b03e79 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.c @@ -142,7 +142,7 @@ struct NoCopyConstruct {}; // CHECK-NEXT: } void acc_compute() { - // CHECK: cir.func{{.*}} @acc_compute() { + // CHECK: cir.func{{.*}} @acc_compute() int someInt; // CHECK-NEXT: %[[SOMEINT:.*]] = cir.alloca !s32i, !cir.ptr, ["someInt"] diff --git a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp index e3f091a5cc1e8..a2c6c3834b1e2 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp @@ -324,7 +324,7 @@ struct HasDtor { // extern "C" void acc_compute() { - // CHECK: cir.func{{.*}} @acc_compute() { + // CHECK: cir.func{{.*}} @acc_compute() int someInt; // CHECK-NEXT: %[[SOMEINT:.*]] = cir.alloca !s32i, !cir.ptr, ["someInt"] diff --git a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.c b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.c index 34b8b6995792b..943539e78ca5b 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.c @@ -45,7 +45,7 @@ struct NoCopyConstruct {}; // CHECK-NEXT: } void acc_compute() { - // CHECK: cir.func{{.*}} @acc_compute() { + // CHECK: cir.func{{.*}} @acc_compute() int someInt; // CHECK-NEXT: %[[SOMEINT:.*]] = cir.alloca !s32i, !cir.ptr, ["someInt"] diff --git a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp index 8262a318edc10..f0bd98c1d8b19 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp @@ -146,7 +146,7 @@ struct HasDtor { // CHECK-NEXT: } extern "C" void acc_compute() { - // CHECK: cir.func{{.*}} @acc_compute() { + // CHECK: cir.func{{.*}} @acc_compute() int someInt; // CHECK-NEXT: %[[SOMEINT:.*]] = 
cir.alloca !s32i, !cir.ptr, ["someInt"] diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c index c99dc09e0ff7a..d65d5d4add0ac 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -std=c23 -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -std=c23 -triple x86_64-linux-pc %s -o - | FileCheck %s struct DefaultOperators { int i; @@ -42,7 +42,39 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int 
%[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ADD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -69,7 +101,39 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool 
-> !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[MUL]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -96,7 +160,81 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member 
%[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast bool_to_int %[[RESULT]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.yield %[[RES_CAST]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast bool_to_int %[[RESULT]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.yield %[[RES_CAST]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast int_to_bool %[[TERNARY]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[RES_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -123,7 +261,81 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: 
%[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// 
CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast bool_to_int %[[RESULT]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.yield %[[RES_CAST]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast bool_to_int %[[RESULT]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.yield %[[RES_CAST]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast int_to_bool %[[TERNARY]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[RES_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -144,7 +356,27 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[AND]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -165,7 +397,27 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// 
CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[OR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -186,7 +438,27 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool 
%[[XOR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -271,7 +543,62 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, 
%[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ADD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -373,7 +700,62 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : 
!cir.ptr, !u32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[MUL]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -476,7 +858,103 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : 
(!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], 
%[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast bool_to_int %[[RESULT]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.yield %[[RES_CAST]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast bool_to_int %[[RESULT]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.yield %[[RES_CAST]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast int_to_bool %[[TERNARY]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[RES_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -578,7 +1056,104 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// 
CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// 
CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast bool_to_int %[[RESULT]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.yield %[[RES_CAST]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast bool_to_int %[[RESULT]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.yield %[[RES_CAST]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast int_to_bool %[[TERNARY]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[RES_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -650,7 +1225,50 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = 
cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[AND]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -680,7 +1298,50 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
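A note on the pattern these checks pin down: every bitwise combiner recipe in this test (&, |, and ^, for both the bare struct and the 5-element array cases) lowers to the same shape: walk the elements, apply the operator member by member, and round-trip bool members through int so the binop always sees integer operands. As a plain C++ sketch of the semantics the surrounding CHECK lines assert (the struct and function names below are illustrative stand-ins, not identifiers from the patch; the member list i/u/b and the trip count of 5 are read off the get_member and cir.const #cir.int<5> checks, and only the members the bitwise combiners actually touch are shown):

struct HasOps { int i; unsigned u; bool b; };  // hypothetical stand-in for the test's record type

// What the '&' array combiner computes; the '|' and '^' recipes below differ
// only in the operator applied.
static void combine_and(HasOps (&lhs)[5], const HasOps (&rhs)[5]) {
  for (long itr = 0; itr < 5; ++itr) {     // cir.for: cond compares itr < 5, step increments itr
    lhs[itr].i = lhs[itr].i & rhs[itr].i;  // cir.binop(and, ...) : !s32i
    lhs[itr].u = lhs[itr].u & rhs[itr].u;  // cir.binop(and, ...) : !u32i
    // bool goes through int: bool_to_int on both operands, integer 'and',
    // then int_to_bool on the result, exactly as the checks require.
    lhs[itr].b = static_cast<bool>(static_cast<int>(lhs[itr].b) &
                                   static_cast<int>(rhs[itr].b));
  }
}

Each member sequence ends in a cir.store back through %[[GET_MEM_LHS]] because the recipe accumulates into the LHS argument and then yields %[[LHSARG]].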
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[OR]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -710,7 +1371,50 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name =
"i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[XOR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -892,6 +1596,64 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : 
!cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ADD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+//
CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -940,6 +1702,64 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = 
cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[MUL]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -988,6 +1808,110 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+//
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]],
%[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.double
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast bool_to_int %[[RESULT]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.yield %[[RES_CAST]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast bool_to_int %[[RESULT]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.yield %[[RES_CAST]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast int_to_bool %[[TERNARY]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store{{.*}} %[[RES_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+//
+// CHECK-NEXT: cir.yield
+//
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
// CHECK-NEXT: }
;
@@ -1036,6 +1960,110 @@ void acc_compute() {
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+//
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.double
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast bool_to_int %[[RESULT]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.yield %[[RES_CAST]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast bool_to_int %[[RESULT]] : !cir.bool -> !s32i
+// CHECK-NEXT: cir.yield %[[RES_CAST]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: %[[RES_CAST:.*]] = cir.cast int_to_bool %[[TERNARY]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store{{.*}} %[[RES_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+//
+// CHECK-NEXT: cir.yield
+//
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
// CHECK-NEXT: }
;
@@ -1056,7 +2084,6 @@ void acc_compute() {
// CHECK-NEXT: cir.condition(%[[COND]])
// CHECK-NEXT: } body {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
- // CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride %[[DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
@@ -1079,6 +2106,52 @@ void acc_compute() {
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[AND]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
// CHECK-NEXT: }
;
@@ -1121,6 +2194,52 @@ void acc_compute() {
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[OR]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
// CHECK-NEXT: }
;
@@ -1163,6 +2282,52 @@ void acc_compute() {
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[XOR]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
// CHECK-NEXT: }
;
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
index b90a2fc8110c2..f32fa2d2d6372 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s

struct DefaultOperators {
  int i;
@@ -21,7 +21,6 @@ void acc_compute() {
  struct DefaultOperatorsNoFloats someVarNoFloats;
  struct DefaultOperatorsNoFloats someVarArrNoFloats[5];
#pragma acc parallel reduction(+:someVar)
- ;
// CHECK: acc.reduction.recipe @reduction_add__ZTS16DefaultOperators : !cir.ptr reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
@@ -44,11 +43,43 @@ void acc_compute() {
//
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ADD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
// CHECK-NEXT: }
+ ;
#pragma acc parallel reduction(*:someVar)
-
// CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTS16DefaultOperators : !cir.ptr reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init]
@@ -71,7 +102,39 @@ void acc_compute() {
//
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[MUL]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
// CHECK-NEXT: }
;
@@ -98,7 +161,78 @@ void acc_compute() {
//
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.double
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
// CHECK-NEXT: }
;
@@ -125,7 +259,78 @@ void acc_compute() {
//
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.double
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
// CHECK-NEXT: }
;
@@ -146,7 +351,27 @@ void acc_compute() {
//
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[AND]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
// CHECK-NEXT: }
;
@@ -167,7 +392,27 @@ void acc_compute() {
//
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[OR]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
// CHECK-NEXT: }
;
@@ -188,7 +433,27 @@ void acc_compute() {
//
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[XOR]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
// CHECK-NEXT: }
;
@@ -286,7 +551,62 @@ void acc_compute() {
//
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ADD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
// CHECK-NEXT: }
;
@@ -388,11 +708,67 @@ void acc_compute() {
//
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[MUL]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
// CHECK-NEXT: }
;
#pragma acc parallel reduction(max:someVarArr)
+
// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
@@ -490,7 +866,100 @@ void acc_compute() {
//
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"]
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[ITR_CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.double
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
// CHECK-NEXT: }
;
@@ -592,7 +1061,100 @@ void acc_compute() {
//
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"]
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[ITR_CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.double
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
// CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
// CHECK-NEXT: }
;
@@ -664,7 +1226,50 @@ void acc_compute() {
//
// CHECK-NEXT: } combiner {
// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : 
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[AND]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -702,7 +1307,50 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[OR]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -739,7 +1387,50 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[XOR]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -846,7 +1537,7 @@ void acc_compute() {
 // CHECK-NEXT: }
 ;
 #pragma acc parallel reduction(||:someVarArr)
-// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <lor> init {
+// CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> reduction_operator <lor> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array<!rec_DefaultOperators x 5>, !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr<!rec_DefaultOperators>, !cir.ptr<!cir.ptr<!rec_DefaultOperators>>, ["arrayinit.temp"]
@@ -935,6 +1626,64 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr<!u32i>
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ADD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr<!cir.bool>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
 ;
@@ -983,6 +1732,64 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr<!s32i>
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr<!u32i>
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[MUL]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr<!cir.bool>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
 ;
@@ -1031,6 +1838,106 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+//
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.double
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+//
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
 ;
@@ -1079,6 +1986,106 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+//
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>> -> !cir.ptr<!rec_DefaultOperators>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!rec_DefaultOperators>, !u64i) -> !cir.ptr<!rec_DefaultOperators>
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!s32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!s32i>, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr<!s32i>
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!u32i>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!u32i>, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr<!u32i>
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr<!cir.float>
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.double>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.double>, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.double
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr<!cir.double>
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr<!rec_DefaultOperators> -> !cir.ptr<!cir.bool>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr<!cir.bool>, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr<!cir.bool>
+//
+// CHECK-NEXT: cir.yield
+//
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!rec_DefaultOperators x 5>>
 // CHECK-NEXT: }
 ;
@@ -1121,6 +2128,52 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[AND]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -1163,6 +2216,52 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[OR]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -1205,6 +2304,52 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[XOR]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -1323,9 +2468,11 @@ void acc_compute() {
 ;
 #pragma acc parallel reduction(||:someVarArr[1:1])
 ;
+
+// CHECK-NEXT: cir.func {{.*}}@_Z11acc_compute
 }
 void uses() {
   acc_compute();
 }
+
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
index 0f7fd84841fb6..9f7336727e5a9 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 void acc_compute() {
   float someVar;
@@ -13,7 +13,10 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.float> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.float> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr<!cir.float>
{{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -26,7 +29,10 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -39,7 +45,18 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !cir.float, !cir.ptr +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -52,7 +69,17 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -110,7 +137,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: 
cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -141,7 +192,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -172,7 +247,38 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: 
cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -203,7 +309,38 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = 
cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr<!cir.float>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
 ;
@@ -301,6 +438,35 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
 ;
@@ -336,6 +502,35 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
 ;
@@ -371,6 +566,42 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], 
%[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -406,6 +637,42 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: 
} ; @@ -441,6 +708,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -476,6 +744,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp index 4d99a43ccb9bd..ffd26319e9bfc 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_compute() { @@ -14,7 +14,10 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -27,7 +30,10 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } @@ -41,7 +47,17 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // 
CHECK-NEXT: } ; @@ -54,7 +70,17 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -112,7 +138,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -143,7 +193,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const 
#cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -174,7 +248,38 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// 
CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -205,7 +310,38 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -303,6 +439,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// 
CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
 ;
@@ -338,6 +503,35 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr<!cir.array<!cir.float x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>> -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr<!cir.float>, !u64i) -> !cir.ptr<!cir.float>
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr<!cir.float>, !cir.float
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr<!cir.array<!cir.float x 5>>
 // CHECK-NEXT: }
 ;
@@ -373,6 +567,42 @@ void acc_compute() {
 // CHECK-NEXT: 
acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -408,6 +638,42 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, 
!cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -443,6 +709,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -478,6 +745,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp index ea00c07513dd0..1e367ee37a30d 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsInline { int i; @@ -17,7 +17,7 @@ struct HasOperatorsInline { bool &operator&&(HasOperatorsInline& other); bool &operator||(HasOperatorsInline& other); // For min/max - HasOperatorsInline &operator<(HasOperatorsInline& other); + bool operator<(HasOperatorsInline& other); HasOperatorsInline &operator=(HasOperatorsInline& other); }; @@ -48,7 +48,7 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: 
^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinepLERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -79,7 +79,7 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinemLERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -88,7 +88,7 @@ void acc_compute() { // CHECK-NEXT: } ; #pragma acc parallel reduction(max:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_max__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -110,7 +110,13 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -119,7 +125,7 @@ void acc_compute() { // CHECK-NEXT: } ; #pragma acc parallel reduction(min:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_min__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -141,7 +147,13 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], 
%[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -172,7 +184,7 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaNERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -181,7 +193,7 @@ void acc_compute() { // CHECK-NEXT: } ; #pragma acc parallel reduction(|:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_ior__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -203,7 +215,7 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineoRERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -212,7 +224,7 @@ void acc_compute() { // CHECK-NEXT: } ; #pragma acc parallel reduction(^:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_xor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -234,7 +246,7 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineeOERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -243,7 +255,7 @@ void acc_compute() { // CHECK-NEXT: } ; #pragma acc parallel reduction(&&:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_land__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -344,7 +356,29 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, 
%[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinepLERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -466,7 +500,28 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinemLERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ 
-588,7 +643,34 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -710,7 +792,34 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHS_STRIDE]], 
%[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true {
+// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
@@ -832,7 +941,28 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaNERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
@@ -896,7 +1026,28 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineoRERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
@@ -959,7 +1110,28 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineeOERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
@@ -1216,6 +1388,32 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinepLERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
@@ -1293,6 +1491,32 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinemLERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
@@ -1370,6 +1594,38 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true {
+// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
@@ -1447,6 +1703,38 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true {
+// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
@@ -1524,6 +1812,32 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaNERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
@@ -1601,6 +1915,32 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineoRERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
@@ -1678,6 +2018,32 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineeOERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}):
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
index b170ed0bf92fa..2f42a5c63f149 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 void acc_compute() {
   int someVar;
@@ -13,7 +13,10 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 ;
@@ -27,7 +30,10 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 ;
@@ -40,7 +46,17 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !s32i, !cir.ptr
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 ;
@@ -53,7 +69,17 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !s32i, !cir.ptr
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 ;
@@ -66,7 +92,10 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 ;
@@ -80,7 +109,10 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 ;
@@ -94,9 +126,13 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
+
 ;
 #pragma acc parallel reduction(&&:someVar)
 // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSi : !cir.ptr reduction_operator init {
@@ -153,7 +189,31 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -184,7 +244,31 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -215,7 +299,38 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"]
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[ITR_CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -246,7 +361,38 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"]
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[ITR_CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -277,7 +423,31 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -307,7 +477,31 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -337,7 +531,31 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -435,6 +653,35 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -470,6 +717,35 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -505,6 +781,42 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -540,6 +852,42 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -575,6 +923,35 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -610,6 +987,35 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -645,6 +1051,35 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -680,6 +1115,7 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// TODO OpenACC: Expecting combination operation here
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -715,6 +1151,7 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// TODO OpenACC: Expecting combination operation here
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
index c678eaee302c1..af7bcf3770fe1 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 
 template
 void acc_compute() {
@@ -14,7 +14,10 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 ;
@@ -28,7 +31,10 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 ;
@@ -41,7 +47,17 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !s32i, !cir.ptr
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 ;
@@ -54,7 +70,17 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !s32i, !cir.ptr
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 ;
@@ -67,7 +93,10 @@ void acc_compute() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 ;
@@ -81,7 +110,10 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 ;
@@ -95,7 +127,10 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
@@ -155,7 +190,31 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -186,7 +245,31 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -217,7 +300,38 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"]
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[ITR_CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -248,7 +362,38 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"]
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[ITR_CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 ;
@@ -279,7 +424,31 @@ void acc_compute() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay
%[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -309,7 +478,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -339,7 +532,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const 
#cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -437,6 +654,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: 
acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -472,6 +718,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -507,6 +782,42 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride 
%[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -542,6 +853,42 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: 
cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -577,6 +924,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -612,6 +988,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = 
cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -647,6 +1052,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// 
CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -682,6 +1116,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -717,6 +1152,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp index 9ccaea205870b..ec890e2b1de65 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsOutline { int i; unsigned u; @@ -18,13 +18,13 @@ HasOperatorsOutline &operator^=(HasOperatorsOutline &, HasOperatorsOutline &); bool &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); bool &operator||(HasOperatorsOutline &, HasOperatorsOutline &); // For min/max -HasOperatorsOutline &operator<(HasOperatorsOutline &, HasOperatorsOutline &); +bool operator<(HasOperatorsOutline &, HasOperatorsOutline &); template void acc_compute() { T someVar; T someVarArr[5]; -#pragma acc parallel reduction(+:someVar) +#pragma acc parallel reduction(+:someVar) // CHECK: acc.reduction.recipe @reduction_add__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] @@ -47,7 +47,7 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZpLR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -55,8 +55,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; - -#pragma acc parallel reduction(*:someVar) +#pragma acc parallel reduction(*:someVar) // CHECK: acc.reduction.recipe @reduction_mul__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] @@ -78,17 +77,17 @@ void acc_compute() { // CHECK-NEXT: acc.yield // // CHECK-NEXT: } combiner { - ; -#pragma acc parallel reduction(max:someVar) // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: 
cir.call @_ZmLR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): // CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineD1Ev(%[[ARG]]) nothrow : (!cir.ptr) // CHECK-NEXT: acc.yield // CHECK-NEXT: } -// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { + ; +#pragma acc parallel reduction(max:someVar) +// CHECK: acc.reduction.recipe @reduction_max__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -110,7 +109,13 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -118,8 +123,8 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(min:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +#pragma acc parallel reduction(min:someVar) +// CHECK: acc.reduction.recipe @reduction_min__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -141,7 +146,13 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -149,7 +160,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(&:someVar) +#pragma acc parallel reduction(&:someVar) // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTS19HasOperatorsOutline : !cir.ptr 
reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] @@ -172,7 +183,7 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZaNR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -180,8 +191,8 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(|:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +#pragma acc parallel reduction(|:someVar) +// CHECK: acc.reduction.recipe @reduction_ior__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -203,7 +214,7 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZoRR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -211,8 +222,8 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(^:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +#pragma acc parallel reduction(^:someVar) +// CHECK: acc.reduction.recipe @reduction_xor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -234,7 +245,7 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZeOR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -242,8 +253,8 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(&&:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +#pragma acc parallel reduction(&&:someVar) +// CHECK: acc.reduction.recipe @reduction_land__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = 
cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -273,7 +284,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(||:someVar) +#pragma acc parallel reduction(||:someVar) // CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] @@ -305,7 +316,7 @@ void acc_compute() { // CHECK-NEXT: } ; -#pragma acc parallel reduction(+:someVarArr) +#pragma acc parallel reduction(+:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] @@ -344,7 +355,29 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZpLR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -368,7 +401,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(*:someVarArr) +#pragma acc parallel reduction(*:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] @@ -466,7 +499,28 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, 
["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZmLR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -490,7 +544,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(max:someVarArr) +#pragma acc parallel reduction(max:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] @@ -588,7 +642,34 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : 
(!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -612,7 +693,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(min:someVarArr) +#pragma acc parallel reduction(min:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] @@ -710,7 +791,34 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -734,7 +842,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(&:someVarArr) +#pragma acc parallel reduction(&:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_19HasOperatorsOutline : !cir.ptr> 
reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] @@ -832,7 +940,28 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZaNR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -856,7 +985,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(|:someVarArr) +#pragma acc parallel reduction(|:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] @@ -896,7 +1025,28 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast 
array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZoRR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -920,7 +1070,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(^:someVarArr) +#pragma acc parallel reduction(^:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] @@ -959,7 +1109,28 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZeOR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -982,8 +1153,9 @@ void acc_compute() { // CHECK-NEXT: } // CHECK-NEXT: acc.yield // CHECK-NEXT: } + ; -#pragma acc parallel reduction(&&:someVarArr) +#pragma acc parallel reduction(&&:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] @@ -1105,9 +1277,8 @@ void acc_compute() { // 
CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(||:someVarArr) - -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { +#pragma acc parallel reduction(||:someVarArr) +// CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1171,7 +1342,7 @@ void acc_compute() { // CHECK-NEXT: } ; -#pragma acc parallel reduction(+:someVarArr[2]) +#pragma acc parallel reduction(+:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_add__Bcnt1__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init"] @@ -1216,6 +1387,32 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZpLR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1248,7 +1445,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(*:someVarArr[2]) +#pragma acc parallel reduction(*:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_mul__Bcnt1__ZTSA5_19HasOperatorsOutline : !cir.ptr> 
reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init"] @@ -1293,6 +1490,32 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZmLR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1325,7 +1548,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(max:someVarArr[2]) +#pragma acc parallel reduction(max:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_max__Bcnt1__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init"] @@ -1370,6 +1593,38 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: 
%[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1402,7 +1657,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(min:someVarArr[2]) +#pragma acc parallel reduction(min:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_min__Bcnt1__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init"] @@ -1447,6 +1702,38 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// 
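`max` has no compound-assignment form, so its combiner compares the two strided elements with the class's `operator<` and copy-assigns the winner back into the LHS slot via `operator=`; the `min` recipe that follows is identical except the ternary branches are swapped so the smaller element wins. A sketch under the same hypothetical stand-in type:

    struct HasOperatorsOutline { int i; };  // stand-in body
    // Stands in for the outlined @_ZltR19HasOperatorsOutlineS0_.
    bool operator<(HasOperatorsOutline &l, HasOperatorsOutline &r) { return l.i < r.i; }
    // Shape of the bounded max combiner: cir.cmp(lt) feeds a cir.ternary whose
    // result is handed to copy assignment (@_ZN19HasOperatorsOutlineaSERKS_).
    void combineMaxSlice(HasOperatorsOutline *lhs, HasOperatorsOutline *rhs,
                         unsigned long lb, unsigned long ub) {
      for (unsigned long i = lb; i < ub; ++i)
        lhs[i] = (lhs[i] < rhs[i]) ? rhs[i] : lhs[i];
    }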
CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1479,7 +1766,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(&:someVarArr[2]) +#pragma acc parallel reduction(&:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_iand__Bcnt1__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init"] @@ -1524,6 +1811,32 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call 
@_ZaNR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1556,7 +1869,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(|:someVarArr[2]) +#pragma acc parallel reduction(|:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_ior__Bcnt1__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init"] @@ -1601,6 +1914,32 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZoRR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1633,7 +1972,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(^:someVarArr[2]) +#pragma acc parallel reduction(^:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe 
@reduction_xor__Bcnt1__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init"] @@ -1678,6 +2017,32 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZeOR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1710,7 +2075,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(&&:someVarArr[2]) +#pragma acc parallel reduction(&&:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_land__Bcnt1__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init"] @@ -1787,7 +2152,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } ; -#pragma acc parallel reduction(||:someVarArr[2]) +#pragma acc parallel reduction(||:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_lor__Bcnt1__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init"] diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c 
b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c index 783aa9a5a110c..08daa702c47f8 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s void acc_compute() { unsigned int someVar; @@ -13,7 +13,10 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -27,7 +30,10 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -40,7 +46,17 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !u32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -53,7 +69,17 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield 
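The scalar `unsigned int` recipes need no loop at all: the combiner loads both operands, applies one `cir.binop` (or, for `max`/`min`, one `cir.cmp`/`cir.ternary` pair), and stores the result back through the LHS pointer. A compact C++ sketch of the two shapes (function names are illustrative only):

    // add/mul/and/or/xor: plain read-modify-write through the LHS pointer.
    void combineAddU32(unsigned *lhs, unsigned *rhs) {
      *lhs = *lhs + *rhs;                  // cir.binop(add) + cir.store
    }
    // max: compare, select, store; min swaps the ternary branches.
    void combineMaxU32(unsigned *lhs, unsigned *rhs) {
      *lhs = (*lhs < *rhs) ? *rhs : *lhs;  // cir.cmp(lt) + cir.ternary
    }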
%[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !u32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -66,7 +92,10 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -80,7 +109,10 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -94,7 +126,10 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -153,7 +188,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: 
%[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -184,7 +243,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -215,7 +298,38 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, 
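When the operand is an array of a builtin type, the two shapes compose: the fixed-extent cir.for from the class case, but with an inline load/binop/store in the body instead of an operator call. A sketch (the name is illustrative):

    // Shape of the array-of-unsigned combiner: stride both arrays and
    // read-modify-write each element pair in place.
    void combineMulU32Arr(unsigned (&lhs)[5], unsigned (&rhs)[5]) {
      for (long i = 0; i < 5; ++i)  // cir.for over !s64i bounded by 5
        lhs[i] = lhs[i] * rhs[i];   // per-element cir.binop(mul) + cir.store
    }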
!u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !u32i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -246,7 +360,38 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !u32i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -277,7 +422,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// 
CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -307,7 +476,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: 
cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -337,7 +530,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -435,6 +652,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, 
!u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -470,6 +716,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -505,6 +780,42 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = 
builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !u32i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -540,6 +851,42 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, 
!u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !u32i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -575,6 +922,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -610,6 +986,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, 
%[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -645,6 +1050,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], 
%[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -680,6 +1114,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -715,6 +1150,7 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp index 7bbc58109a18a..423b42b695758 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp @@ -158,7 +158,7 @@ struct HasDtor { // CHECK-NEXT: } extern "C" void acc_loop() { - // CHECK: cir.func{{.*}} @acc_loop() { + // CHECK: cir.func{{.*}} @acc_loop() int someInt; // CHECK-NEXT: %[[SOMEINT:.*]] = cir.alloca !s32i, !cir.ptr, ["someInt"] diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp index 038afcaa28be0..1a77c0f10a144 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct DefaultOperators { int i; @@ -15,7 +15,7 @@ struct DefaultOperatorsNoFloats { }; template -void acc_combined() { +void acc_loop() { T someVar; T someVarArr[5]; struct DefaultOperatorsNoFloats someVarNoFloats; @@ -43,12 +43,43 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : 
!s32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ADD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(*:someVar) - // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTS16DefaultOperators : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_DefaultOperators, !cir.ptr, ["openacc.reduction.init", init] @@ -71,10 +102,42 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// 
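For aggregate operands the combiner is unrolled field by field: each member of `DefaultOperators` is addressed on both sides with `cir.get_member`, combined with the field's natural operation, and stored back; the `bool` member round-trips through `int` (`bool_to_int`, the arithmetic, then `int_to_bool`). A sketch of the add case, with the struct body reconstructed from the member names and types in the CHECK lines:

    struct DefaultOperators {
      int i;
      unsigned u;
      float f;
      double d;
      bool b;
    };
    // Shape of the member-wise add combiner: one load/op/store per field.
    void combineAdd(DefaultOperators &lhs, DefaultOperators &rhs) {
      lhs.i = lhs.i + rhs.i;  // signed add is tagged nsw in CIR
      lhs.u = lhs.u + rhs.u;
      lhs.f = lhs.f + rhs.f;
      lhs.d = lhs.d + rhs.d;
      lhs.b = static_cast<bool>(static_cast<int>(lhs.b) + static_cast<int>(rhs.b));
    }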
CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[MUL]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(max:someVar) // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS16DefaultOperators : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) @@ -98,10 +161,81 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = 
cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// 
CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(min:someVar) // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS16DefaultOperators : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) @@ -125,10 +259,81 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: 
%[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(&:someVarNoFloats) // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTS24DefaultOperatorsNoFloats : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) @@ -146,7 +351,27 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// 
TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[AND]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -167,7 +392,27 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "b"} : 
!cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[OR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -188,10 +433,30 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHSARG]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[XOR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(&&:someVar) // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS16DefaultOperators : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) @@ -218,7 +483,7 @@ void acc_combined() { // TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(||:someVar) // CHECK-NEXT: acc.reduction.recipe 
@reduction_lor__ZTS16DefaultOperators : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) @@ -245,7 +510,7 @@ void acc_combined() { // TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(+:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { @@ -286,10 +551,65 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: 
%[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ADD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(*:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) @@ -388,11 +708,67 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = 
cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[MUL]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(max:someVarArr) + // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] @@ -490,10 +866,103 @@ void acc_combined() { 
// // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, 
!cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(min:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) @@ -592,7 +1061,100 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = 
cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = 
cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.double +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -664,10 +1226,53 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load 
%[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[AND]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(|:someVarArrNoFloats) // CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) @@ -702,7 +1307,50 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: 
%[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[OR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -739,10 +1387,53 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, 
!cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i +// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[XOR]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(&&:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) @@ -844,9 +1535,9 @@ void acc_combined() { // TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield 
%[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(||:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -888,7 +1579,7 @@ void acc_combined() { // TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } - for(int i=0;i < 5; ++i); + for(int i = 0; i < 5; ++i); #pragma acc loop reduction(+:someVarArr[2]) // CHECK-NEXT: acc.reduction.recipe @reduction_add__Bcnt1__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { @@ -935,9 +1626,67 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr +// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i 
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ADD]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(*:someVarArr[2])
 // CHECK-NEXT: acc.reduction.recipe @reduction_mul__Bcnt1__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
@@ -983,9 +1732,67 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) nsw : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[MUL]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(max:someVarArr[2])
 // CHECK-NEXT: acc.reduction.recipe @reduction_max__Bcnt1__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
@@ -1031,9 +1838,109 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+//
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.double
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+// CHECK-NEXT: cir.yield
+//
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(min:someVarArr[2])
 // CHECK-NEXT: acc.reduction.recipe @reduction_min__Bcnt1__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
@@ -1079,9 +1986,109 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+//
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !s32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !u32i
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.float, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.double, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.double
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.double
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.double, !cir.ptr
+//
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_CAST]], %[[RHS_CAST]]) : !s32i, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+//
+// CHECK-NEXT: cir.yield
+//
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(&:someVarArrNoFloats[2])
 // CHECK-NEXT: acc.reduction.recipe @reduction_iand__Bcnt1__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
@@ -1121,9 +2128,55 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[AND]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(|:someVarArrNoFloats[2])
 // CHECK-NEXT: acc.reduction.recipe @reduction_ior__Bcnt1__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
@@ -1163,9 +2216,55 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[OR]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(^:someVarArrNoFloats[2])
 // CHECK-NEXT: acc.reduction.recipe @reduction_xor__Bcnt1__ZTSA5_24DefaultOperatorsNoFloats : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
@@ -1205,9 +2304,55 @@ void acc_combined() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !s32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !u32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !u32i
+// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[GET_MEM_LHS]] : !u32i, !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_LHS:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[GET_MEM_RHS:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "b"} : !cir.ptr -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_RHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[RHS_INT_CAST:.*]] = cir.cast bool_to_int %[[RHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[GET_MEM_LHS]] : !cir.ptr, !cir.bool
+// CHECK-NEXT: %[[LHS_INT_CAST:.*]] = cir.cast bool_to_int %[[LHS_LOAD]] : !cir.bool -> !s32i
+// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_INT_CAST]], %[[RHS_INT_CAST]]) : !s32i
+// CHECK-NEXT: %[[RES_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[XOR]] : !s32i -> !cir.bool
+// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_BOOL_CAST]], %[[GET_MEM_LHS]] : !cir.bool, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(||:someVarArr[2])
 // CHECK-NEXT: acc.reduction.recipe @reduction_lor__Bcnt1__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
@@ -1303,29 +2448,29 @@ void acc_combined() {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(+:someVarArr[1:1])
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(*:someVarArr[1:1])
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(max:someVarArr[1:1])
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(min:someVarArr[1:1])
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(&:someVarArrNoFloats[1:1])
 for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(|:someVarArrNoFloats[1:1])
 for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(^:someVarArrNoFloats[1:1])
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(&&:someVarArr[1:1])
- for(int i=0;i < 5; ++i);
+ for(int i = 0; i < 5; ++i);
 #pragma acc loop reduction(||:someVarArr[1:1])
- for(int i=0;i < 5; ++i);
- // CHECK-NEXT: cir.func {{.*}}@_Z12acc_combined
+ for(int i = 0; i < 5; ++i);
+ // CHECK-NEXT: cir.func {{.*}}@_Z8acc_loop
 }
 void uses() {
- acc_combined();
+ acc_loop();
 }
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
index 11b7c359ebb07..7faef7111a9c8 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
 template
 void acc_loop() {
@@ -14,7 +14,10 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -27,7 +30,10 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHSARG]]
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
@@ -41,7 +47,17 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !cir.float, !cir.ptr
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -54,7 +70,17 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !cir.float, !cir.ptr
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -112,7 +138,31 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -143,7 +193,31 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -174,7 +248,38 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"]
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[ITR_CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -205,7 +310,38 @@ void acc_loop() {
 //
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}})
-// TODO OpenACC: Expecting combination operation here
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"]
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i
+// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[ITR_CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -303,6 +439,35 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -338,6 +503,35 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64}
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[COND]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float
+// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]]
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -373,6 +567,42 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -408,6 +638,42 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i
+// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"]
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr
+// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !cir.float, !cir.bool
+// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }, false {
+// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float
+// CHECK-NEXT: cir.yield %[[RESULT]]
+// CHECK-NEXT: }) : (!cir.bool) -> !cir.float
+// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -443,6 +709,7 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// TODO OpenACC: Expecting combination operation here
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
@@ -478,6 +745,7 @@ void acc_loop() {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } combiner {
 // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}))
+// TODO OpenACC: Expecting combination operation here
 // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr>
 // CHECK-NEXT: }
 for(int i=0;i < 5; ++i);
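For readers tracing the CHECK lines in the float-array hunks above, a plain C++ analogue of the element-wise combiner loop they verify may help. This is an illustrative sketch only, not the emitted CIR; the names combine_add, combine_max, and N are hypothetical and not part of the tests.

#include <cstddef>

// The tests reduce over float[5]; each element of the RHS recipe copy is
// folded into the LHS, mirroring the checked cir.for { cir.load /
// cir.binop / cir.store } sequence.
constexpr std::size_t N = 5;

void combine_add(float (&lhs)[N], const float (&rhs)[N]) {
  for (std::size_t i = 0; i < N; ++i)
    lhs[i] = lhs[i] + rhs[i]; // cir.binop(add, ...) then cir.store
}

void combine_max(float (&lhs)[N], const float (&rhs)[N]) {
  for (std::size_t i = 0; i < N; ++i)
    lhs[i] = lhs[i] < rhs[i] ? rhs[i] : lhs[i]; // cir.cmp(lt) + cir.ternary
}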
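The loop-reduction-clause-inline-ops.cpp diff that follows exercises a different lowering: for a type with user-defined operators, the combiner calls the overloads instead of emitting raw binops. A minimal source-level sketch of that semantics, assuming a simplified struct S (hypothetical; the test's actual type is HasOperatorsInline with reference-returning operators):

struct S {
  int i;
  S &operator+=(S &other) { i += other.i; return *this; } // like @_ZN...pLERS_
  bool operator<(S &other) const { return i < other.i; }  // like @_ZN...ltERS_
  S &operator=(S &other) { i = other.i; return *this; }   // like @_ZN...aSERS_
};

// reduction(+:someVar): the combiner is a single operator+= call.
void combine_add(S &lhs, S &rhs) { lhs += rhs; }

// reduction(max:someVar): operator< feeds a select (the checked cir.ternary)
// that picks the larger operand, and the result is written back via operator=.
void combine_max(S &lhs, S &rhs) { lhs = (lhs < rhs) ? rhs : lhs; }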
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp index d95da8cc883da..43c9fbbce7533 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsInline { int i; @@ -17,7 +17,7 @@ struct HasOperatorsInline { bool &operator&&(HasOperatorsInline& other); bool &operator||(HasOperatorsInline& other); // For min/max - HasOperatorsInline &operator<(HasOperatorsInline& other); + bool operator<(HasOperatorsInline& other); HasOperatorsInline &operator=(HasOperatorsInline& other); }; @@ -48,7 +48,7 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinepLERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -79,7 +79,7 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinemLERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -88,7 +88,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop reduction(max:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_max__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -110,7 +110,13 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -119,7 +125,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop 
reduction(min:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_min__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -141,7 +147,13 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -172,7 +184,7 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaNERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -181,7 +193,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop reduction(|:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_ior__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -203,7 +215,7 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineoRERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -212,7 +224,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop reduction(^:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_xor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -234,7 +246,7 @@ void acc_loop() { 
// // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineeOERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -243,7 +255,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop reduction(&&:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_land__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -344,7 +356,29 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinepLERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -466,7 +500,28 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : 
!s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinemLERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -588,7 +643,34 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -710,7 +792,34 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: 
Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -832,7 +941,28 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaNERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: 
%[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -896,7 +1026,28 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineoRERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -959,7 +1110,28 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineeOERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: 
cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1216,6 +1388,32 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinepLERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1293,6 +1491,32 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] 
= cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlinemLERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1370,6 +1594,38 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] 
: !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1447,6 +1703,38 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZN18HasOperatorsInlineltERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1524,6 +1812,32 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment 
= 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineaNERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1601,6 +1915,32 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineoRERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // 
CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1678,6 +2018,32 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN18HasOperatorsInlineeOERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp index d207e07b78dfd..5353218866d47 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_loop() { @@ -14,7 +14,10 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: 
} for(int i=0;i < 5; ++i); @@ -28,7 +31,10 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -41,7 +47,17 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -54,7 +70,17 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHSARG]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHSARG]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -67,7 +93,10 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -81,7 +110,10 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load 
{{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -95,7 +127,10 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHSARG]] // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } @@ -155,7 +190,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -186,7 +245,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] 
= cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -217,7 +300,38 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -248,7 +362,38 @@ void acc_loop() { 
// // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[MAX_IDX:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[ITR_CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[MAX_IDX]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[ITR_CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -279,7 +424,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// 
CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -309,7 +478,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -339,7 +532,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : 
!cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -437,6 +654,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[ADD]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -472,6 +718,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// 
CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[MUL]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -507,6 +782,42 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr 
+// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -542,6 +853,42 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[CMP]], true { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RESULT:.*]] = cir.load{{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: cir.yield %[[RESULT]] +// CHECK-NEXT: }) : (!cir.bool) -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[TERNARY]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i 
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -577,6 +924,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[AND:.*]] = cir.binop(and, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[AND]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -612,6 +988,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] 
= cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[OR:.*]] = cir.binop(or, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[OR]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -647,6 +1052,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[XOR:.*]] = cir.binop(xor, %[[LHS_LOAD]], %[[RHS_LOAD]]) : !s32i +// CHECK-NEXT: cir.store {{.*}} %[[XOR]], %[[LHS_STRIDE]] +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -682,6 +1116,7 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: 
^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -717,6 +1152,7 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// TODO OpenACC: Expecting combination operation here // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp index a33c25a36bf77..e193cfa1a5ab2 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsOutline { int i; unsigned u; @@ -18,7 +18,7 @@ HasOperatorsOutline &operator^=(HasOperatorsOutline &, HasOperatorsOutline &); bool &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); bool &operator||(HasOperatorsOutline &, HasOperatorsOutline &); // For min/max -HasOperatorsOutline &operator<(HasOperatorsOutline &, HasOperatorsOutline &); +bool operator<(HasOperatorsOutline &, HasOperatorsOutline &); template void acc_loop() { @@ -47,7 +47,7 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZpLR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -55,7 +55,6 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } for(int i=0;i < 5; ++i); - #pragma acc loop reduction(*:someVar) // CHECK: acc.reduction.recipe @reduction_mul__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) @@ -78,17 +77,17 @@ void acc_loop() { // CHECK-NEXT: acc.yield // // CHECK-NEXT: } combiner { - for(int i=0;i < 5; ++i); -#pragma acc loop reduction(max:someVar) // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZmLR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): // CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineD1Ev(%[[ARG]]) nothrow : (!cir.ptr) // CHECK-NEXT: acc.yield // CHECK-NEXT: } -// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { + for(int i=0;i < 5; ++i); +#pragma acc loop reduction(max:someVar) +// CHECK: acc.reduction.recipe @reduction_max__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { 
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -110,7 +109,13 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -119,7 +124,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop reduction(min:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_min__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -141,7 +146,13 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -172,7 +183,7 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZaNR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -181,7 +192,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop reduction(|:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_ior__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_ior__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // 
CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -203,7 +214,7 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZoRR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -212,7 +223,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop reduction(^:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_xor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_xor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -234,7 +245,7 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.call @_ZeOR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -243,7 +254,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop reduction(&&:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_land__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -344,7 +355,29 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call 
@_ZpLR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -466,7 +499,28 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZmLR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -588,7 +642,34 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: 
%[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -710,7 +791,34 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -832,7 +940,28 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i 
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZaNR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -896,7 +1025,28 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZoRR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -959,7 +1109,28 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> 
{{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZeOR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -982,6 +1153,7 @@ void acc_loop() { // CHECK-NEXT: } // CHECK-NEXT: acc.yield // CHECK-NEXT: } + for(int i=0;i < 5; ++i); #pragma acc loop reduction(&&:someVarArr) // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { @@ -1106,8 +1278,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop reduction(||:someVarArr) - -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1216,6 +1387,32 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// 
CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZpLR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1293,6 +1490,32 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZmLR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1370,6 +1593,38 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, 
%[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1447,6 +1702,38 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: 
%[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LT:.*]] = cir.call @_ZltR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LT]], true { +// CHECK-NEXT: cir.yield %[[LHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }, false { +// CHECK-NEXT: cir.yield %[[RHS_STRIDE]] : !cir.ptr +// CHECK-NEXT: }) : (!cir.bool) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[TERNARY]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1524,6 +1811,32 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZaNR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, 
%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1601,6 +1914,32 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZoRR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -1678,6 +2017,32 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], 
%[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: cir.call @_ZeOR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp b/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp index fc696ff32decb..20ad7a31b635e 100644 --- a/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp +++ b/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s // Note: unlike the 'private' recipe checks, this is just for spot-checking, // so this test isn't as comprehensive. The same code paths are used for @@ -90,7 +90,88 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: } // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { -// CHECK-NEXT: ^bb0(%[[REF:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB3:.*]] = acc.get_lowerbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB3]] : index to !u64i +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ITR3:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB3_CAST]], %[[ITR3]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR3_LOAD:.*]] = cir.load %[[ITR3]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR3_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR3_LOAD:.*]] = cir.load %[[ITR3]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_TLA_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr x 5> x 5>> -> !cir.ptr x 5>> +// CHECK-NEXT: %[[LHS_BOUND3_STRIDE:.*]] = cir.ptr_stride %[[LHS_TLA_DECAY]], %[[ITR3_LOAD]] : (!cir.ptr x 5>>, !u64i) -> !cir.ptr x 5>> +// CHECK-NEXT: %[[RHS_TLA_DECAY:.*]] = cir.cast array_to_ptrdecay 
%[[RHSARG]] : !cir.ptr x 5> x 5>> -> !cir.ptr x 5>> +// CHECK-NEXT: %[[RHS_BOUND3_STRIDE:.*]] = cir.ptr_stride %[[RHS_TLA_DECAY]], %[[ITR3_LOAD]] : (!cir.ptr x 5>>, !u64i) -> !cir.ptr x 5>> +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB2:.*]] = acc.get_lowerbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB2]] : index to !u64i +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ITR2:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB2_CAST]], %[[ITR2]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR2_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_BOUND3_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHS_BOUND3_STRIDE]] : !cir.ptr x 5>> -> !cir.ptr> +// CHECK-NEXT: %[[LHS_BOUND2_STRIDE:.*]] = cir.ptr_stride %[[LHS_BOUND3_STRIDE_DECAY]], %[[ITR2_LOAD]] : (!cir.ptr>, !u64i) -> !cir.ptr> +// CHECK-NEXT: %[[RHS_BOUND3_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHS_BOUND3_STRIDE]] : !cir.ptr x 5>> -> !cir.ptr> +// CHECK-NEXT: %[[RHS_BOUND2_STRIDE:.*]] = cir.ptr_stride %[[RHS_BOUND3_STRIDE_DECAY]], %[[ITR2_LOAD]] : (!cir.ptr>, !u64i) -> !cir.ptr> +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB1:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB1]] : index to !u64i +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[ITR1:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB1_CAST]], %[[ITR1]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR1_LOAD]], %[[UB1_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_BOUND2_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHS_BOUND2_STRIDE]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_BOUND1_STRIDE:.*]] = cir.ptr_stride %[[LHS_BOUND2_STRIDE_DECAY]], %[[ITR1_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_BOUND2_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHS_BOUND2_STRIDE]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_BOUND1_STRIDE:.*]] = cir.ptr_stride %[[RHS_BOUND2_STRIDE_DECAY]], %[[ITR1_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_BOUND1_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_BOUND1_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[OP:.*]] = cir.binop(add, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw +// CHECK-NEXT: 
cir.store{{.*}} %[[OP]], %[[LHS_GET_I]] : !s32i +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield // CHECK-NEXT:} destroy { // CHECK-NEXT: ^bb0(%[[REF:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): @@ -355,7 +436,89 @@ void do_things(unsigned A, unsigned B) { // // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { -// CHECK-NEXT: ^bb0(%[[REF:.*]]: !cir.ptr>>> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr>>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB3:.*]] = acc.get_lowerbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB3]] : index to !u64i +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ITR3:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB3_CAST]], %[[ITR3]] : !u64i, !cir.ptr + +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR3_LOAD:.*]] = cir.load %[[ITR3]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR3_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR3_LOAD:.*]] = cir.load %[[ITR3]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_TLA_LOAD:.*]] = cir.load %[[LHSARG]] : !cir.ptr>>>, !cir.ptr>> +// CHECK-NEXT: %[[LHS_BOUND3_STRIDE:.*]] = cir.ptr_stride %[[LHS_TLA_LOAD]], %[[ITR3_LOAD]] : (!cir.ptr>>, !u64i) -> !cir.ptr>> +// CHECK-NEXT: %[[RHS_TLA_LOAD:.*]] = cir.load %[[RHSARG]] : !cir.ptr>>>, !cir.ptr>> +// CHECK-NEXT: %[[RHS_BOUND3_STRIDE:.*]] = cir.ptr_stride %[[RHS_TLA_LOAD]], %[[ITR3_LOAD]] : (!cir.ptr>>, !u64i) -> !cir.ptr>> +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB2:.*]] = acc.get_lowerbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB2]] : index to !u64i +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: 
%[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ITR2:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB2_CAST]], %[[ITR2]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR2_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_BOUND3_STRIDE_LOAD:.*]] = cir.load %[[LHS_BOUND3_STRIDE]] : !cir.ptr>>, !cir.ptr> +// CHECK-NEXT: %[[LHS_BOUND2_STRIDE:.*]] = cir.ptr_stride %[[LHS_BOUND3_STRIDE_LOAD]], %[[ITR2_LOAD]] : (!cir.ptr>, !u64i) -> !cir.ptr> +// CHECK-NEXT: %[[RHS_BOUND3_STRIDE_LOAD:.*]] = cir.load %[[RHS_BOUND3_STRIDE]] : !cir.ptr>>, !cir.ptr> +// CHECK-NEXT: %[[RHS_BOUND2_STRIDE:.*]] = cir.ptr_stride %[[RHS_BOUND3_STRIDE_LOAD]], %[[ITR2_LOAD]] : (!cir.ptr>, !u64i) -> !cir.ptr> +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB1:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB1]] : index to !u64i +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[ITR1:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB1_CAST]], %[[ITR1]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR1_LOAD]], %[[UB1_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_BOUND2_STRIDE_LOAD:.*]] = cir.load %[[LHS_BOUND2_STRIDE]] : !cir.ptr>, !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_BOUND2_STRIDE_LOAD]], %[[ITR1_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_BOUND2_STRIDE_LOAD:.*]] = cir.load %[[RHS_BOUND2_STRIDE]] : !cir.ptr>, !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_BOUND2_STRIDE_LOAD]], %[[ITR1_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[OP:.*]] = cir.binop(mul, %[[LHS_LOAD]], %[[RHS_LOAD]]) nsw +// CHECK-NEXT: cir.store{{.*}} %[[OP]], %[[LHS_GET_I]] : !s32i +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr +// CHECK-NEXT: 
cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[REF:.*]]: !cir.ptr>>> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}):
diff --git a/clang/test/CIR/IR/atomic.cir b/clang/test/CIR/IR/atomic.cir
index 85207633a5294..790297ff99f47 100644
--- a/clang/test/CIR/IR/atomic.cir
+++ b/clang/test/CIR/IR/atomic.cir
@@ -5,17 +5,30 @@
 cir.func @atomic_xchg(%ptr: !cir.ptr<!s32i>, %val: !s32i) {
   // CHECK-LABEL: @atomic_xchg
-  %0 = cir.atomic.xchg relaxed %ptr, %val : !cir.ptr<!s32i> -> !s32i
-  // CHECK: cir.atomic.xchg relaxed %{{.+}}, %{{.+}} : !cir.ptr<!s32i> -> !s32i
-  %1 = cir.atomic.xchg consume %ptr, %val : !cir.ptr<!s32i> -> !s32i
-  // CHECK: cir.atomic.xchg consume %{{.+}}, %{{.+}} : !cir.ptr<!s32i> -> !s32i
-  %2 = cir.atomic.xchg acquire %ptr, %val : !cir.ptr<!s32i> -> !s32i
-  // CHECK: cir.atomic.xchg acquire %{{.+}}, %{{.+}} : !cir.ptr<!s32i> -> !s32i
-  %3 = cir.atomic.xchg release %ptr, %val : !cir.ptr<!s32i> -> !s32i
-  // CHECK: cir.atomic.xchg release %{{.+}}, %{{.+}} : !cir.ptr<!s32i> -> !s32i
-  %4 = cir.atomic.xchg acq_rel %ptr, %val : !cir.ptr<!s32i> -> !s32i
-  // CHECK: cir.atomic.xchg acq_rel %{{.+}}, %{{.+}} : !cir.ptr<!s32i> -> !s32i
-  %5 = cir.atomic.xchg seq_cst %ptr, %val : !cir.ptr<!s32i> -> !s32i
-  // CHECK: cir.atomic.xchg seq_cst %{{.+}}, %{{.+}} : !cir.ptr<!s32i> -> !s32i
+  %0 = cir.atomic.xchg relaxed %ptr, %val : (!cir.ptr<!s32i>, !s32i) -> !s32i
+  // CHECK: cir.atomic.xchg relaxed %{{.+}}, %{{.+}} : (!cir.ptr<!s32i>, !s32i) -> !s32i
+  %1 = cir.atomic.xchg consume %ptr, %val : (!cir.ptr<!s32i>, !s32i) -> !s32i
+  // CHECK: cir.atomic.xchg consume %{{.+}}, %{{.+}} : (!cir.ptr<!s32i>, !s32i) -> !s32i
+  %2 = cir.atomic.xchg acquire %ptr, %val : (!cir.ptr<!s32i>, !s32i) -> !s32i
+  // CHECK: cir.atomic.xchg acquire %{{.+}}, %{{.+}} : (!cir.ptr<!s32i>, !s32i) -> !s32i
+  %3 = cir.atomic.xchg release %ptr, %val : (!cir.ptr<!s32i>, !s32i) -> !s32i
+  // CHECK: cir.atomic.xchg release %{{.+}}, %{{.+}} : (!cir.ptr<!s32i>, !s32i) -> !s32i
+  %4 = cir.atomic.xchg acq_rel %ptr, %val : (!cir.ptr<!s32i>, !s32i) -> !s32i
+  // CHECK: cir.atomic.xchg acq_rel %{{.+}}, %{{.+}} : (!cir.ptr<!s32i>, !s32i) -> !s32i
+  %5 = cir.atomic.xchg seq_cst %ptr, %val : (!cir.ptr<!s32i>, !s32i) -> !s32i
+  // CHECK: cir.atomic.xchg seq_cst %{{.+}}, %{{.+}} : (!cir.ptr<!s32i>, !s32i) -> !s32i
+  cir.return
+}
+
+cir.func @atomic_cmpxchg(%ptr: !cir.ptr<!s32i>, %expected: !s32i, %desired: !s32i) {
+  // CHECK-LABEL: @atomic_cmpxchg
+  %0, %1 = cir.atomic.cmpxchg success(relaxed) failure(relaxed) %ptr, %expected, %desired : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
+  // CHECK: cir.atomic.cmpxchg success(relaxed) failure(relaxed) %{{.+}}, %{{.+}}, %{{.+}} : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
+  %2, %3 = cir.atomic.cmpxchg weak success(relaxed) failure(relaxed) %ptr, %expected, %desired : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
+  // CHECK: cir.atomic.cmpxchg weak success(relaxed) failure(relaxed) %{{.+}}, %{{.+}}, %{{.+}} : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
+  %4, %5 = cir.atomic.cmpxchg success(seq_cst) failure(acquire) %ptr, %expected, %desired : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
+  // CHECK: cir.atomic.cmpxchg success(seq_cst) failure(acquire) %{{.+}}, %{{.+}}, %{{.+}} : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
+  %6, %7 = cir.atomic.cmpxchg weak success(seq_cst) failure(acquire) %ptr, %expected, %desired : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
+  // CHECK: cir.atomic.cmpxchg weak success(seq_cst) failure(acquire) %{{.+}}, %{{.+}}, %{{.+}} : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
   cir.return
 }
 
diff --git a/clang/test/CIR/IR/inline-attrs.cir b/clang/test/CIR/IR/inline-attrs.cir
new file mode 100644
index 0000000000000..f525abe240366
--- /dev/null
+++ b/clang/test/CIR/IR/inline-attrs.cir
@@ -0,0 +1,33 @@
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+
+module {
+  cir.func @noinline_func(%arg0: !s32i) -> !s32i inline(never) {
+    cir.return %arg0 : !s32i
+  }
+  cir.func @always_inline_func(%arg0: !s32i) -> !s32i inline(always) {
+    cir.return %arg0 : !s32i
+  }
+  cir.func @inline_hint_func(%arg0: !s32i) -> !s32i inline(hint) {
+    cir.return %arg0 : !s32i
+  }
+  cir.func @regular_func(%arg0: !s32i) -> !s32i {
+    cir.return %arg0 : !s32i
+  }
+  cir.func dso_local @noinline_with_attrs(%arg0: !s32i) -> !s32i inline(never) {
+    cir.return %arg0 : !s32i
+  }
+  cir.func private @noinline_decl(!s32i) -> !s32i inline(never)
+  cir.func private @always_inline_decl(!s32i) -> !s32i inline(always)
+  cir.func private @inline_hint_decl(!s32i) -> !s32i inline(hint)
+}
+
+// CHECK: cir.func @noinline_func(%arg0: !s32i) -> !s32i inline(never)
+// CHECK: cir.func @always_inline_func(%arg0: !s32i) -> !s32i inline(always)
+// CHECK: cir.func @inline_hint_func(%arg0: !s32i) -> !s32i inline(hint)
+// CHECK: cir.func @regular_func(%arg0: !s32i) -> !s32i {
+// CHECK: cir.func dso_local @noinline_with_attrs(%arg0: !s32i) -> !s32i inline(never)
+// CHECK: cir.func private @noinline_decl(!s32i) -> !s32i inline(never)
+// CHECK: cir.func private @always_inline_decl(!s32i) -> !s32i inline(always)
+// CHECK: cir.func private @inline_hint_decl(!s32i) -> !s32i inline(hint)
diff --git a/clang/test/CIR/IR/invalid-atomic.cir b/clang/test/CIR/IR/invalid-atomic.cir
new file mode 100644
index 0000000000000..a124e43a98479
--- /dev/null
+++ b/clang/test/CIR/IR/invalid-atomic.cir
@@ -0,0 +1,7 @@
+// RUN: cir-opt %s -verify-diagnostics -split-input-file
+
+cir.func @f1(%arg0: !cir.ptr<!cir.float>, %arg1: !cir.float) {
+  // expected-error @below {{only atomic add, sub, max, and min operation could operate on floating-point values}}
+  %0 = cir.atomic.fetch and seq_cst %arg0, %arg1 : (!cir.ptr<!cir.float>, !cir.float) -> !cir.float
+  cir.return
+}
diff --git a/clang/test/CIR/IR/invalid-try-catch.cir b/clang/test/CIR/IR/invalid-try-catch.cir
new file mode 100644
index 0000000000000..94df4b63ed629
--- /dev/null
+++ b/clang/test/CIR/IR/invalid-try-catch.cir
@@ -0,0 +1,157 @@
+// RUN: cir-opt %s -verify-diagnostics -split-input-file
+
+module {
+
+cir.func dso_local @invalid_catch_without_all_or_type() {
+  cir.scope {
+    cir.try {
+      cir.yield
+    // expected-error @below {{'cir.try' expected 'all' or 'type' keyword}}
+    } catch [invalid_keyword {
+      cir.yield
+    }
+  }
+  cir.return
+}
+
+}
+
+// -----
+
+module {
+
+cir.func dso_local @invalid_catch_rtti_type() {
+  cir.scope {
+    // expected-error @below {{'cir.try' op attribute 'handler_types' failed to satisfy constraint: catch all or unwind or global view array attribute}}
+    cir.try {
+      cir.yield
+    } catch [type #cir.undef] {
+      cir.yield
+    }
+  }
+  cir.return
+}
+
+}
+
+// -----
+
+module {
+
+cir.func dso_local @invalid_catch_empty_block() {
+  cir.scope {
+    cir.try {
+      cir.yield
+    }
+    // expected-error @below {{'cir.try' handler region shall not be empty}}
+    catch all {
+    }
+  }
+  cir.return
+}
+
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module {
+
+cir.func dso_local @invalid_catch_not_terminated() {
+  %a = cir.alloca !s32i, !cir.ptr<!s32i>, ["a", init]
+  cir.scope {
+    cir.try {
+      cir.yield
+    }
+    // expected-error @below {{'cir.try' blocks are expected to be explicitly terminated}}
+    catch all {
+      %tmp_a = cir.load %a : !cir.ptr<!s32i>, !s32i
+    }
+  }
+  cir.return
+}
+
+}
+
+// -----
+
+module {
+
+cir.func dso_local @invalid_catch_multiple_catch_all() {
+  cir.scope {
+    cir.try {
+      cir.yield
+    } catch all {
+      cir.yield
+    }
+    // expected-error @below {{op 'cir.try' can't have more than one catch all}}
+    catch all {
+      cir.yield
+    }
+  }
+  cir.return
+}
+
+}
+
+// -----
+
+module {
+
+cir.func dso_local @invalid_catch_without_type_info() {
+  cir.scope {
+    cir.try {
+      cir.yield
+    }
+    // expected-error @below {{expected attribute value}}
+    // expected-error @below {{op 'cir.try' expected valid RTTI info attribute}}
+    catch [type] {
+      cir.yield
+    }
+  }
+  cir.return
+}
+
+}
+
+// -----
+
+module {
+
+cir.func dso_local @invalid_catch_all_with_type_info() {
+  cir.scope {
+    cir.try {
+      cir.yield
+    }
+    // expected-error @below {{op 'cir.try' catch all dosen't need RTTI info attribute}}
+    catch [all] {
+      cir.yield
+    }
+  }
+  cir.return
+}
+
+}
+
+// -----
+
+module {
+
+cir.func dso_local @invalid_unwind_with_catch_all() {
+  cir.scope {
+    cir.try {
+      cir.yield
+    }
+    catch all {
+      cir.yield
+    }
+    // expected-error @below {{op 'cir.try' unwind can't be used with catch all}}
+    unwind {
+
+    }
+  }
+  cir.return
+}
+
+}
diff --git a/clang/test/CIR/IR/try-catch.cir b/clang/test/CIR/IR/try-catch.cir
new file mode 100644
index 0000000000000..7becd0b559f5e
--- /dev/null
+++ b/clang/test/CIR/IR/try-catch.cir
@@ -0,0 +1,84 @@
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s
+
+!u8i = !cir.int<u, 8>
+
+module {
+
+cir.global "private" constant external @_ZTIi : !cir.ptr<!u8i>
+cir.global "private" constant external @_ZTIPKc : !cir.ptr<!u8i>
+
+cir.func dso_local @empty_try_block_with_catch_all() {
+  cir.scope {
+    cir.try {
+      cir.yield
+    } catch all {
+      cir.yield
+    }
+  }
+  cir.return
+}
+
+// CHECK: cir.func dso_local @empty_try_block_with_catch_all() {
+// CHECK:   cir.scope {
+// CHECK:     cir.try {
+// CHECK:       cir.yield
+// CHECK:     } catch all {
+// CHECK:       cir.yield
+// CHECK:     }
+// CHECK:   }
+// CHECK:   cir.return
+// CHECK: }
+
+cir.func dso_local @empty_try_block_with_catch_unwind() {
+  cir.scope {
+    cir.try {
+      cir.yield
+    } unwind {
+      cir.yield
+    }
+  }
+  cir.return
+}
+
+// CHECK: cir.func dso_local @empty_try_block_with_catch_unwind() {
+// CHECK:   cir.scope {
+// CHECK:     cir.try {
+// CHECK:       cir.yield
+// CHECK:     } unwind {
+// CHECK:       cir.yield
+// CHECK:     }
+// CHECK:   }
+// CHECK:   cir.return
+// CHECK: }
+
+cir.func dso_local @empty_try_block_with_catch_ist() {
+  cir.scope {
+    cir.try {
+      cir.yield
+    } catch [type #cir.global_view<@_ZTIi> : !cir.ptr<!u8i>] {
+      cir.yield
+    } catch [type #cir.global_view<@_ZTIPKc> : !cir.ptr<!u8i>] {
+      cir.yield
+    } unwind {
+      cir.yield
+    }
+  }
+  cir.return
+}
+
+// CHECK: cir.func dso_local @empty_try_block_with_catch_ist() {
+// CHECK:   cir.scope {
+// CHECK:     cir.try {
+// CHECK:       cir.yield
+// CHECK:     } catch [type #cir.global_view<@_ZTIi> : !cir.ptr<!u8i>] {
+// CHECK:       cir.yield
+// CHECK:     } catch [type #cir.global_view<@_ZTIPKc> : !cir.ptr<!u8i>] {
+// CHECK:       cir.yield
+// CHECK:     } unwind {
+// CHECK:       cir.yield
+// CHECK:     }
+// CHECK:   }
+// CHECK:   cir.return
+// CHECK: }
+
+}
diff --git a/clang/test/CIR/Lowering/basic.cpp
b/clang/test/CIR/Lowering/basic.cpp index 5642cef68c2d0..63beb0af6d688 100644 --- a/clang/test/CIR/Lowering/basic.cpp +++ b/clang/test/CIR/Lowering/basic.cpp @@ -5,7 +5,7 @@ int f1() { return i; } -// CHECK: define{{.*}} i32 @_Z2f1v() { +// CHECK: define{{.*}} i32 @_Z2f1v(){{.*}} { // CHECK: %[[RV:.*]] = alloca i32, i64 1, align 4 // CHECK: %[[I_PTR:.*]] = alloca i32, i64 1, align 4 // CHECK: %[[I:.*]] = load i32, ptr %[[I_PTR]], align 4 @@ -18,7 +18,7 @@ int f2() { return i; } -// CHECK: define{{.*}} i32 @_Z2f2v() { +// CHECK: define{{.*}} i32 @_Z2f2v(){{.*}} { // CHECK: %[[RV:.*]] = alloca i32, i64 1, align 4 // CHECK: %[[I_PTR:.*]] = alloca i32, i64 1, align 4 // CHECK: store i32 2, ptr %[[I_PTR]], align 4 diff --git a/clang/test/CIR/Lowering/func-simple.cpp b/clang/test/CIR/Lowering/func-simple.cpp index 96306babc8674..df5b007c3483d 100644 --- a/clang/test/CIR/Lowering/func-simple.cpp +++ b/clang/test/CIR/Lowering/func-simple.cpp @@ -23,7 +23,7 @@ int scopes() { } } } -// CHECK: define{{.*}} i32 @_Z6scopesv() { +// CHECK: define{{.*}} i32 @_Z6scopesv(){{.*}} { // CHECK: %[[RV:.*]] = alloca i32, i64 1, align 4 // CHECK: br label %[[LABEL1:.*]] // CHECK: [[LABEL1]]: @@ -40,7 +40,7 @@ int scopes() { // CHECK: } long longfunc() { return 42l; } -// CHECK: define{{.*}} i64 @_Z8longfuncv() { +// CHECK: define{{.*}} i64 @_Z8longfuncv(){{.*}} { // CHECK: %[[RV:.*]] = alloca i64, i64 1, align 8 // CHECK: store i64 42, ptr %[[RV]], align 8 // CHECK: %[[R:.*]] = load i64, ptr %[[RV]], align 8 @@ -48,7 +48,7 @@ long longfunc() { return 42l; } // CHECK: } unsigned unsignedfunc() { return 42u; } -// CHECK: define{{.*}} i32 @_Z12unsignedfuncv() { +// CHECK: define{{.*}} i32 @_Z12unsignedfuncv(){{.*}} { // CHECK: %[[RV:.*]] = alloca i32, i64 1, align 4 // CHECK: store i32 42, ptr %[[RV]], align 4 // CHECK: %[[R:.*]] = load i32, ptr %[[RV]], align 4 @@ -56,7 +56,7 @@ unsigned unsignedfunc() { return 42u; } // CHECK: } unsigned long long ullfunc() { return 42ull; } -// CHECK: define{{.*}} i64 @_Z7ullfuncv() { +// CHECK: define{{.*}} i64 @_Z7ullfuncv(){{.*}} { // CHECK: %[[RV:.*]] = alloca i64, i64 1, align 8 // CHECK: store i64 42, ptr %[[RV]], align 8 // CHECK: %[[R:.*]] = load i64, ptr %[[RV]], align 8 @@ -64,7 +64,7 @@ unsigned long long ullfunc() { return 42ull; } // CHECK: } bool boolfunc() { return true; } -// CHECK: define{{.*}} i1 @_Z8boolfuncv() { +// CHECK: define{{.*}} i1 @_Z8boolfuncv(){{.*}} { // CHECK: %[[RV:.*]] = alloca i8, i64 1, align 1 // CHECK: store i8 1, ptr %[[RV]], align 1 // CHECK: %[[R8:.*]] = load i8, ptr %[[RV]], align 1 diff --git a/clang/test/CIR/func-simple.cpp b/clang/test/CIR/func-simple.cpp index c9cb5c5595352..88947578b5f9e 100644 --- a/clang/test/CIR/func-simple.cpp +++ b/clang/test/CIR/func-simple.cpp @@ -2,17 +2,17 @@ // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o - | FileCheck %s void empty() { } -// CHECK: cir.func{{.*}} @_Z5emptyv() { +// CHECK: cir.func{{.*}} @_Z5emptyv() // CHECK: cir.return // CHECK: } void voidret() { return; } -// CHECK: cir.func{{.*}} @_Z7voidretv() { +// CHECK: cir.func{{.*}} @_Z7voidretv() // CHECK: cir.return // CHECK: } int intfunc() { return 42; } -// CHECK: cir.func{{.*}} @_Z7intfuncv() -> !s32i { +// CHECK: cir.func{{.*}} @_Z7intfuncv() -> !s32i // CHECK: %0 = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: %1 = cir.const #cir.int<42> : !s32i // CHECK: cir.store %1, %0 : !s32i, !cir.ptr @@ -27,7 +27,7 @@ int scopes() { } } } -// CHECK: cir.func{{.*}} @_Z6scopesv() -> 
!s32i { +// CHECK: cir.func{{.*}} @_Z6scopesv() -> !s32i // CHECK: %0 = cir.alloca !s32i, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: cir.scope { // CHECK: cir.scope { @@ -68,7 +68,7 @@ unsigned long long ullfunc() { return 42ull; } // CHECK: } bool boolfunc() { return true; } -// CHECK: cir.func{{.*}} @_Z8boolfuncv() -> !cir.bool { +// CHECK: cir.func{{.*}} @_Z8boolfuncv() -> !cir.bool // CHECK: %0 = cir.alloca !cir.bool, !cir.ptr, ["__retval"] {alignment = 1 : i64} // CHECK: %1 = cir.const #true // CHECK: cir.store %1, %0 : !cir.bool, !cir.ptr @@ -77,7 +77,7 @@ bool boolfunc() { return true; } // CHECK: } float floatfunc() { return 42.42f; } -// CHECK: cir.func{{.*}} @_Z9floatfuncv() -> !cir.float { +// CHECK: cir.func{{.*}} @_Z9floatfuncv() -> !cir.float // CHECK: %0 = cir.alloca !cir.float, !cir.ptr, ["__retval"] {alignment = 4 : i64} // CHECK: %1 = cir.const #cir.fp<4.242 // CHECK: cir.store %1, %0 : !cir.float, !cir.ptr @@ -86,7 +86,7 @@ float floatfunc() { return 42.42f; } // CHECK: } double doublefunc() { return 42.42; } -// CHECK: cir.func{{.*}} @_Z10doublefuncv() -> !cir.double { +// CHECK: cir.func{{.*}} @_Z10doublefuncv() -> !cir.double // CHECK: %0 = cir.alloca !cir.double, !cir.ptr, ["__retval"] {alignment = 8 : i64} // CHECK: %1 = cir.const #cir.fp<4.242 // CHECK: cir.store %1, %0 : !cir.double, !cir.ptr diff --git a/clang/test/CXX/dcl.decl/dcl.decomp/p2.cpp b/clang/test/CXX/dcl.decl/dcl.decomp/p2.cpp index cad7d8e89ce0f..717ea3aaa2276 100644 --- a/clang/test/CXX/dcl.decl/dcl.decomp/p2.cpp +++ b/clang/test/CXX/dcl.decl/dcl.decomp/p2.cpp @@ -16,8 +16,8 @@ int array() { using X3 = X[3]; auto [a3, b3, c3] = X3{1, 2, 3}; - auto &[d, e] = arr; // expected-error {{type 'int[3]' decomposes into 3 elements, but only 2 names were provided}} - auto &[f, g, h, i] = arr; // expected-error {{type 'int[3]' decomposes into 3 elements, but 4 names were provided}} + auto &[d, e] = arr; // expected-error {{type 'int[3]' binds to 3 elements, but only 2 names were provided}} + auto &[f, g, h, i] = arr; // expected-error {{type 'int[3]' binds to 3 elements, but 4 names were provided}} auto &[r0, r1, r2] = arr; const auto &[cr0, cr1, cr2] = arr; diff --git a/clang/test/CXX/dcl.decl/dcl.decomp/p3.cpp b/clang/test/CXX/dcl.decl/dcl.decomp/p3.cpp index ce5eefc6bfdb4..b7fef12a40b38 100644 --- a/clang/test/CXX/dcl.decl/dcl.decomp/p3.cpp +++ b/clang/test/CXX/dcl.decl/dcl.decomp/p3.cpp @@ -5,10 +5,10 @@ using size_t = decltype(sizeof(0)); struct A { int x, y; }; struct B { int x, y; }; -void no_tuple_size_1() { auto [x, y] = A(); } // ok, decompose elementwise +void no_tuple_size_1() { auto [x, y] = A(); } // ok, bind elementwise namespace std { template struct tuple_size; } -void no_tuple_size_2() { auto [x, y] = A(); } // ok, decompose elementwise +void no_tuple_size_2() { auto [x, y] = A(); } // ok, bind elementwise struct Bad1 { int a, b; }; template<> struct std::tuple_size {}; @@ -16,15 +16,15 @@ void no_tuple_size_3() { auto [x, y] = Bad1(); } // ok, omitting value is valid struct Bad2 {}; template<> struct std::tuple_size { const int value = 5; }; -void no_tuple_size_4() { auto [x, y] = Bad2(); } // expected-error {{cannot decompose this type; 'std::tuple_size::value' is not a valid integral constant expression}} +void no_tuple_size_4() { auto [x, y] = Bad2(); } // expected-error {{cannot bind this type; 'std::tuple_size::value' is not a valid integral constant expression}} template<> struct std::tuple_size { static const int value = 3; }; template<> struct std::tuple_size { enum { 
value = 3 }; }; void no_get_1() { { - auto [a0, a1] = A(); // expected-error {{decomposes into 3 elements}} - auto [b0, b1] = B(); // expected-error {{decomposes into 3 elements}} + auto [a0, a1] = A(); // expected-error {{binds to 3 elements}} + auto [b0, b1] = B(); // expected-error {{binds to 3 elements}} } auto [a0, a1, a2] = A(); // expected-error {{undeclared identifier 'get'}} expected-note {{in implicit initialization of binding declaration 'a0'}} } diff --git a/clang/test/CXX/dcl.decl/dcl.decomp/p4.cpp b/clang/test/CXX/dcl.decl/dcl.decomp/p4.cpp index 7141124768e01..532a967aa8f76 100644 --- a/clang/test/CXX/dcl.decl/dcl.decomp/p4.cpp +++ b/clang/test/CXX/dcl.decl/dcl.decomp/p4.cpp @@ -25,10 +25,10 @@ namespace NonPublicMembers { struct NonPublic4 : NonPublic2 {}; void test() { - auto [a1] = NonPublic1(); // expected-error {{cannot decompose protected member 'a' of 'NonPublicMembers::NonPublic1'}} - auto [a2] = NonPublic2(); // expected-error {{cannot decompose private member 'a' of 'NonPublicMembers::NonPublic2'}} - auto [a3] = NonPublic3(); // expected-error {{cannot decompose members of inaccessible base class 'A' of 'NonPublicMembers::NonPublic3'}} - auto [a4] = NonPublic4(); // expected-error {{cannot decompose private member 'a' of 'NonPublicMembers::NonPublic2'}} + auto [a1] = NonPublic1(); // expected-error {{cannot bind protected member 'a' of 'NonPublicMembers::NonPublic1'}} + auto [a2] = NonPublic2(); // expected-error {{cannot bind private member 'a' of 'NonPublicMembers::NonPublic2'}} + auto [a3] = NonPublic3(); // expected-error {{cannot bind members of inaccessible base class 'A' of 'NonPublicMembers::NonPublic3'}} + auto [a4] = NonPublic4(); // expected-error {{cannot bind private member 'a' of 'NonPublicMembers::NonPublic2'}} } } @@ -46,8 +46,8 @@ namespace AnonymousMember { }; void test() { - auto [a1] = Struct(); // expected-error {{cannot decompose class type 'Struct' because it has an anonymous struct member}} - auto [a2] = Union(); // expected-error {{cannot decompose class type 'Union' because it has an anonymous union member}} + auto [a1] = Struct(); // expected-error {{cannot bind class type 'Struct' because it has an anonymous struct member}} + auto [a2] = Union(); // expected-error {{cannot bind class type 'Union' because it has an anonymous union member}} } } @@ -73,12 +73,12 @@ namespace MultipleClasses { struct M : virtual J, L {}; void test() { - auto [b] = B(); // expected-error {{cannot decompose class type 'B': both it and its base class 'A' have non-static data members}} - auto [d] = D(); // expected-error {{cannot decompose class type 'D': its base classes 'A' and 'C' have non-static data members}} + auto [b] = B(); // expected-error {{cannot bind class type 'B': both it and its base class 'A' have non-static data members}} + auto [d] = D(); // expected-error {{cannot bind class type 'D': its base classes 'A' and 'C' have non-static data members}} auto [e] = E(); - auto [f] = F(); // expected-error-re {{cannot decompose members of ambiguous base class 'A' of 'F':{{.*}}struct MultipleClasses::F -> A{{.*}}struct MultipleClasses::F -> E -> A}} + auto [f] = F(); // expected-error-re {{cannot bind members of ambiguous base class 'A' of 'F':{{.*}}struct MultipleClasses::F -> A{{.*}}struct MultipleClasses::F -> E -> A}} auto [h] = H(); // ok, only one (virtual) base subobject even though there are two paths to it - auto [k] = K(); // expected-error {{cannot decompose members of ambiguous base class 'I'}} + auto [k] = K(); // expected-error {{cannot bind 
members of ambiguous base class 'I'}} auto [m] = M(); // ok, all paths to I are through the same virtual base subobject J same(); @@ -214,7 +214,7 @@ namespace p0969r0 { auto &[x, y] = b; } void test_external(B b) { - auto &[x, y] = b; // expected-error {{cannot decompose members of inaccessible base class 'A' of 'p0969r0::B'}} + auto &[x, y] = b; // expected-error {{cannot bind members of inaccessible base class 'A' of 'p0969r0::B'}} } struct C { @@ -229,13 +229,13 @@ namespace p0969r0 { struct D : C { static void test_member(D d, C c) { auto &[x1, y1] = d; - auto &[x2, y2] = c; // expected-error {{cannot decompose protected member 'y' of 'p0969r0::C'}} + auto &[x2, y2] = c; // expected-error {{cannot bind protected member 'y' of 'p0969r0::C'}} } }; void test_friend(D d) { auto &[x, y] = d; } void test_external(D d) { - auto &[x, y] = d; // expected-error {{cannot decompose protected member 'y' of 'p0969r0::C'}} + auto &[x, y] = d; // expected-error {{cannot bind protected member 'y' of 'p0969r0::C'}} } } diff --git a/clang/test/CXX/drs/cwg22xx.cpp b/clang/test/CXX/drs/cwg22xx.cpp index 8c8ad9f7f74ee..34119a162623e 100644 --- a/clang/test/CXX/drs/cwg22xx.cpp +++ b/clang/test/CXX/drs/cwg22xx.cpp @@ -202,7 +202,7 @@ namespace cwg2285 { // cwg2285: 4 void test() { using T = int[1]; auto [a] = T{a}; - // since-cxx17-error@-1 {{binding 'a' cannot appear in the initializer of its own decomposition declaration}} + // since-cxx17-error@-1 {{binding 'a' cannot appear in the initializer of its own structured binding declaration}} } #endif } // namespace cwg2285 diff --git a/clang/test/CXX/drs/cwg23xx.cpp b/clang/test/CXX/drs/cwg23xx.cpp index 128566537b577..72cf249f8b53d 100644 --- a/clang/test/CXX/drs/cwg23xx.cpp +++ b/clang/test/CXX/drs/cwg23xx.cpp @@ -440,7 +440,7 @@ template <> struct tuple_size { namespace cwg2386 { void no_value() { auto [x, y] = Bad1(); } void wrong_value() { auto [x, y] = Bad2(); } -// since-cxx17-error@-1 {{type 'Bad2' decomposes into 42 elements, but only 2 names were provided}} +// since-cxx17-error@-1 {{type 'Bad2' binds to 42 elements, but only 2 names were provided}} #endif } // namespace cwg2386 diff --git a/clang/test/CXX/drs/cwg26xx.cpp b/clang/test/CXX/drs/cwg26xx.cpp index bceef6419b00a..aa87f5a1857c6 100644 --- a/clang/test/CXX/drs/cwg26xx.cpp +++ b/clang/test/CXX/drs/cwg26xx.cpp @@ -183,17 +183,17 @@ T get_T(); void use() { UnaryC auto [a, b] = get_S(); - // since-cxx20-error@-1 {{decomposition declaration cannot be declared with constrained 'auto'}} + // since-cxx20-error@-1 {{structured binding declaration cannot be declared with constrained 'auto'}} BinaryC auto [c, d] = get_S(); - // since-cxx20-error@-1 {{decomposition declaration cannot be declared with constrained 'auto'}} + // since-cxx20-error@-1 {{structured binding declaration cannot be declared with constrained 'auto'}} } template void TemplUse() { UnaryC auto [a, b] = get_T(); - // since-cxx20-error@-1 {{decomposition declaration cannot be declared with constrained 'auto'}} + // since-cxx20-error@-1 {{structured binding declaration cannot be declared with constrained 'auto'}} BinaryC auto [c, d] = get_T(); - // since-cxx20-error@-1 {{decomposition declaration cannot be declared with constrained 'auto'}} + // since-cxx20-error@-1 {{structured binding declaration cannot be declared with constrained 'auto'}} } #endif } // namespace cwg2635 diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp index 
5f1243a654f54..af2dce81d8a4b 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp @@ -149,7 +149,7 @@ namespace std_example { template constexpr bool is_same_v = true; template concept same_as = is_same_v; - // expected-note@-1 {{because 'is_same_v' evaluated to false}} + // expected-note@-1 {{because 'is_same_v' evaluated to false}} static_assert(C1); static_assert(C1); @@ -160,7 +160,7 @@ namespace std_example { template concept C2 = requires(T x) { {*x} -> same_as; - // expected-note@-1{{because 'same_as' evaluated to false}} + // expected-note@-1{{because 'same_as' evaluated to false}} // expected-note@-2{{because '*x' would be invalid: indirection requires pointer operand ('int' invalid)}} }; diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp index 9fc4906459373..70a96bed05867 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp @@ -27,7 +27,7 @@ using r4i = X::r4; // expected-error{{constraints not satisfied for c // C++ [expr.prim.req.nested] Examples namespace std_example { - template concept C1 = sizeof(U) == 1; // expected-note{{because 'sizeof(decltype(+t)) == 1' (4 == 1) evaluated to false}} + template concept C1 = sizeof(U) == 1; // expected-note{{because 'sizeof(int) == 1' (4 == 1) evaluated to false}} template concept D = requires (T t) { requires C1; // expected-note{{because 'decltype(+t)' (aka 'int') does not satisfy 'C1'}} diff --git a/clang/test/CXX/module/module.import/p6.cpp b/clang/test/CXX/module/module.import/p6.cpp index cb2d799e5b565..9e378a5fe7759 100644 --- a/clang/test/CXX/module/module.import/p6.cpp +++ b/clang/test/CXX/module/module.import/p6.cpp @@ -3,6 +3,9 @@ // RUN: %clang_cc1 -std=c++20 -x c++-header %t/bad-header-unit.h \ // RUN: -emit-header-unit -o %t/bad-header-unit.pcm -verify +// RUN: %clang_cc1 -std=c++20 -x c++-header %t/bad-header-unit-declspec.h \ +// RUN: -emit-header-unit -o %t/bad-header-unit.pcm -verify \ +// RUN: -fdeclspec //--- bad-header-unit.h @@ -77,3 +80,13 @@ template bool b() { } inline bool B = b(); + +__attribute__((weak)) int weak_fun_definition() { return 42; } + +__attribute__((weak)) int weak_var_definition = 42; + +//--- bad-header-unit-declspec.h + +/* The cases below should compile without diagnostics. 
*/ + +__declspec(selectany) int selectany_var_definition = 42; // expected-no-diagnostics diff --git a/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp b/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp index 93c2beb5ba1fe..938a3d096ae37 100644 --- a/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp +++ b/clang/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp @@ -75,7 +75,7 @@ namespace X { void test_D() { #if __cplusplus >= 201703L for (extern auto [x, y] : D()) { - } // expected-error@-1 {{decomposition declaration cannot be declared 'extern'}} + } // expected-error@-1 {{structured binding declaration cannot be declared 'extern'}} // expected-error@-2 {{loop variable '[x, y]' may not be declared 'extern'}} #endif } diff --git a/clang/test/CXX/temp/temp.param/p10-2a.cpp b/clang/test/CXX/temp/temp.param/p10-2a.cpp index 4f192d33ebe7e..c0406f88db5f3 100644 --- a/clang/test/CXX/temp/temp.param/p10-2a.cpp +++ b/clang/test/CXX/temp/temp.param/p10-2a.cpp @@ -95,8 +95,8 @@ concept OneOf = (is_same_v || ...); // #OneOf // expected-note@#OneOf 3{{because 'is_same_v' evaluated to false}} // expected-note@#OneOf 3{{and 'is_same_v' evaluated to false}} // expected-note@#OneOf {{because 'is_same_v' evaluated to false}} -// expected-note@#OneOf {{because 'is_same_v' evaluated to false}} -// expected-note@#OneOf {{and 'is_same_v' evaluated to false}} +// expected-note@#OneOf {{because 'is_same_v' evaluated to false}} +// expected-note@#OneOf {{and 'is_same_v' evaluated to false}} // expected-note@#OneOf {{and 'is_same_v' evaluated to false}} template T, OneOf U> diff --git a/clang/test/CXX/temp/temp.res/temp.local/p6.cpp b/clang/test/CXX/temp/temp.res/temp.local/p6.cpp index 205df5fa25870..beed7bfd15acf 100644 --- a/clang/test/CXX/temp/temp.res/temp.local/p6.cpp +++ b/clang/test/CXX/temp/temp.res/temp.local/p6.cpp @@ -165,7 +165,7 @@ A<0>::B a; template int shadow() { // expected-note{{template parameter is declared here}} using arr = int[1]; - // expected-warning@+1 {{decomposition declarations are a C++17 extension}} + // expected-warning@+1 {{structured binding declarations are a C++17 extension}} auto [ T // expected-error {{declaration of 'T' shadows template parameter}} ] = arr{}; diff --git a/clang/test/CodeGen/AArch64/ABI-align-packed.c b/clang/test/CodeGen/AArch64/ABI-align-packed.c index 09f9180cdb0a7..d88a3383fe402 100644 --- a/clang/test/CodeGen/AArch64/ABI-align-packed.c +++ b/clang/test/CodeGen/AArch64/ABI-align-packed.c @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -emit-llvm -O2 -o - %s | FileCheck %s #include @@ -58,8 +59,8 @@ struct non_packed_struct gs_non_packed_struct; // CHECK-SAME: (double [[D0:%.*]], double [[D1:%.*]], double [[D2:%.*]], double [[D3:%.*]], double [[D4:%.*]], double [[D5:%.*]], double [[D6:%.*]], double [[D7:%.*]], double noundef [[D8:%.*]], [1 x <8 x i16>] alignstack(16) [[S_NON_PACKED_STRUCT_COERCE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[S_NON_PACKED_STRUCT_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [1 x <8 x i16>] [[S_NON_PACKED_STRUCT_COERCE]], 0 -// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA2:![0-9]+]] -// CHECK-NEXT: store <8 x i16> [[S_NON_PACKED_STRUCT_COERCE_FCA_0_EXTRACT]], ptr @gs_non_packed_struct, align 16, !tbaa [[TBAA6:![0-9]+]] +// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA6:![0-9]+]] 
+// CHECK-NEXT: store <8 x i16> [[S_NON_PACKED_STRUCT_COERCE_FCA_0_EXTRACT]], ptr @gs_non_packed_struct, align 16, !tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: ret void __attribute__((noinline)) void named_arg_non_packed_struct(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7, @@ -113,8 +114,8 @@ struct packed_struct gs_packed_struct; // CHECK-SAME: (double [[D0:%.*]], double [[D1:%.*]], double [[D2:%.*]], double [[D3:%.*]], double [[D4:%.*]], double [[D5:%.*]], double [[D6:%.*]], double [[D7:%.*]], double noundef [[D8:%.*]], [1 x <8 x i16>] alignstack(8) [[S_PACKED_STRUCT_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[S_PACKED_STRUCT_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [1 x <8 x i16>] [[S_PACKED_STRUCT_COERCE]], 0 -// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA2]] -// CHECK-NEXT: store <8 x i16> [[S_PACKED_STRUCT_COERCE_FCA_0_EXTRACT]], ptr @gs_packed_struct, align 1, !tbaa [[TBAA6]] +// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA6]] +// CHECK-NEXT: store <8 x i16> [[S_PACKED_STRUCT_COERCE_FCA_0_EXTRACT]], ptr @gs_packed_struct, align 1, !tbaa [[TBAA8]] // CHECK-NEXT: ret void __attribute__((noinline)) void named_arg_packed_struct(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7, @@ -168,8 +169,8 @@ struct packed_member gs_packed_member; // CHECK-SAME: (double [[D0:%.*]], double [[D1:%.*]], double [[D2:%.*]], double [[D3:%.*]], double [[D4:%.*]], double [[D5:%.*]], double [[D6:%.*]], double [[D7:%.*]], double noundef [[D8:%.*]], [1 x <8 x i16>] alignstack(8) [[S_PACKED_MEMBER_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[S_PACKED_MEMBER_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [1 x <8 x i16>] [[S_PACKED_MEMBER_COERCE]], 0 -// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA2]] -// CHECK-NEXT: store <8 x i16> [[S_PACKED_MEMBER_COERCE_FCA_0_EXTRACT]], ptr @gs_packed_member, align 1, !tbaa [[TBAA6]] +// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA6]] +// CHECK-NEXT: store <8 x i16> [[S_PACKED_MEMBER_COERCE_FCA_0_EXTRACT]], ptr @gs_packed_member, align 1, !tbaa [[TBAA8]] // CHECK-NEXT: ret void __attribute__((noinline)) void named_arg_packed_member(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7, @@ -223,8 +224,8 @@ struct aligned_struct_8 gs_aligned_struct_8; // CHECK-SAME: (double [[D0:%.*]], double [[D1:%.*]], double [[D2:%.*]], double [[D3:%.*]], double [[D4:%.*]], double [[D5:%.*]], double [[D6:%.*]], double [[D7:%.*]], double noundef [[D8:%.*]], [1 x <8 x i16>] alignstack(16) [[S_ALIGNED_STRUCT_8_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[S_ALIGNED_STRUCT_8_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [1 x <8 x i16>] [[S_ALIGNED_STRUCT_8_COERCE]], 0 -// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA2]] -// CHECK-NEXT: store <8 x i16> [[S_ALIGNED_STRUCT_8_COERCE_FCA_0_EXTRACT]], ptr @gs_aligned_struct_8, align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA6]] +// CHECK-NEXT: store <8 x i16> [[S_ALIGNED_STRUCT_8_COERCE_FCA_0_EXTRACT]], ptr @gs_aligned_struct_8, align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void __attribute__((noinline)) void named_arg_aligned_struct_8(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7, @@ -278,8 +279,8 @@ struct aligned_member_8 gs_aligned_member_8; // CHECK-SAME: 
(double [[D0:%.*]], double [[D1:%.*]], double [[D2:%.*]], double [[D3:%.*]], double [[D4:%.*]], double [[D5:%.*]], double [[D6:%.*]], double [[D7:%.*]], double noundef [[D8:%.*]], [1 x <8 x i16>] alignstack(16) [[S_ALIGNED_MEMBER_8_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[S_ALIGNED_MEMBER_8_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [1 x <8 x i16>] [[S_ALIGNED_MEMBER_8_COERCE]], 0 -// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA2]] -// CHECK-NEXT: store <8 x i16> [[S_ALIGNED_MEMBER_8_COERCE_FCA_0_EXTRACT]], ptr @gs_aligned_member_8, align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA6]] +// CHECK-NEXT: store <8 x i16> [[S_ALIGNED_MEMBER_8_COERCE_FCA_0_EXTRACT]], ptr @gs_aligned_member_8, align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void __attribute__((noinline)) void named_arg_aligned_member_8(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7, @@ -333,8 +334,8 @@ struct pragma_packed_struct_8 gs_pragma_packed_struct_8; // CHECK-SAME: (double [[D0:%.*]], double [[D1:%.*]], double [[D2:%.*]], double [[D3:%.*]], double [[D4:%.*]], double [[D5:%.*]], double [[D6:%.*]], double [[D7:%.*]], double noundef [[D8:%.*]], [1 x <8 x i16>] alignstack(8) [[S_PRAGMA_PACKED_STRUCT_8_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[S_PRAGMA_PACKED_STRUCT_8_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [1 x <8 x i16>] [[S_PRAGMA_PACKED_STRUCT_8_COERCE]], 0 -// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA2]] -// CHECK-NEXT: store <8 x i16> [[S_PRAGMA_PACKED_STRUCT_8_COERCE_FCA_0_EXTRACT]], ptr @gs_pragma_packed_struct_8, align 8, !tbaa [[TBAA6]] +// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA6]] +// CHECK-NEXT: store <8 x i16> [[S_PRAGMA_PACKED_STRUCT_8_COERCE_FCA_0_EXTRACT]], ptr @gs_pragma_packed_struct_8, align 8, !tbaa [[TBAA8]] // CHECK-NEXT: ret void __attribute__((noinline)) void named_arg_pragma_packed_struct_8(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7, @@ -388,8 +389,8 @@ struct pragma_packed_struct_4 gs_pragma_packed_struct_4; // CHECK-SAME: (double [[D0:%.*]], double [[D1:%.*]], double [[D2:%.*]], double [[D3:%.*]], double [[D4:%.*]], double [[D5:%.*]], double [[D6:%.*]], double [[D7:%.*]], double noundef [[D8:%.*]], [1 x <8 x i16>] alignstack(8) [[S_PRAGMA_PACKED_STRUCT_4_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[S_PRAGMA_PACKED_STRUCT_4_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [1 x <8 x i16>] [[S_PRAGMA_PACKED_STRUCT_4_COERCE]], 0 -// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA2]] -// CHECK-NEXT: store <8 x i16> [[S_PRAGMA_PACKED_STRUCT_4_COERCE_FCA_0_EXTRACT]], ptr @gs_pragma_packed_struct_4, align 4, !tbaa [[TBAA6]] +// CHECK-NEXT: store double [[D8]], ptr @gd, align 8, !tbaa [[TBAA6]] +// CHECK-NEXT: store <8 x i16> [[S_PRAGMA_PACKED_STRUCT_4_COERCE_FCA_0_EXTRACT]], ptr @gs_pragma_packed_struct_4, align 4, !tbaa [[TBAA8]] // CHECK-NEXT: ret void __attribute__((noinline)) void named_arg_pragma_packed_struct_4(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7, @@ -437,9 +438,9 @@ void test_pragma_packed_struct_4() { variadic_pragma_packed_struct_4(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_pragma_packed_struct_4); } //. 
-// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"double", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} -// CHECK: [[TBAA6]] = !{[[META4]], [[META4]], i64 0} +// CHECK: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"double", [[META4]], i64 0} +// CHECK: [[TBAA8]] = !{[[META4]], [[META4]], i64 0} //. diff --git a/clang/test/CodeGen/AArch64/fp8-init-list.c b/clang/test/CodeGen/AArch64/fp8-init-list.c index 7c0f6278b2090..f461977ea835a 100644 --- a/clang/test/CodeGen/AArch64/fp8-init-list.c +++ b/clang/test/CodeGen/AArch64/fp8-init-list.c @@ -34,26 +34,26 @@ struct S s; // CHECK-LABEL: define dso_local void @f( // CHECK-SAME: <1 x i8> [[X:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: store <1 x i8> [[X]], ptr @s, align 1, !tbaa [[__MFP8_TBAA2:![0-9]+]] +// CHECK-NEXT: store <1 x i8> [[X]], ptr @s, align 1, !tbaa [[__MFP8_TBAA6:![0-9]+]] // CHECK-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z1fu6__mfp8( // CHECK-CXX-SAME: <1 x i8> [[X:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: store <1 x i8> [[X]], ptr @s, align 1, !tbaa [[__MFP8_TBAA2:![0-9]+]] +// CHECK-CXX-NEXT: store <1 x i8> [[X]], ptr @s, align 1, !tbaa [[__MFP8_TBAA6:![0-9]+]] // CHECK-CXX-NEXT: ret void // void f(__mfp8 x) { s = (struct S){x}; } //. -// CHECK: [[__MFP8_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"__mfp8", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[__MFP8_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"__mfp8", [[META4]], i64 0} //. -// CHECK-CXX: [[__MFP8_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK-CXX: [[META3]] = !{!"__mfp8", [[META4:![0-9]+]], i64 0} -// CHECK-CXX: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK-CXX: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK-CXX: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK-CXX: [[__MFP8_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-CXX: [[META7]] = !{!"__mfp8", [[META4]], i64 0} //. 
diff --git a/clang/test/CodeGen/AArch64/ls64-inline-asm.c b/clang/test/CodeGen/AArch64/ls64-inline-asm.c index 1d217eb8801e5..04e2207357817 100644 --- a/clang/test/CodeGen/AArch64/ls64-inline-asm.c +++ b/clang/test/CodeGen/AArch64/ls64-inline-asm.c @@ -6,7 +6,7 @@ struct foo { unsigned long long x[8]; }; // CHECK-LABEL: define dso_local void @load( // CHECK-SAME: ptr noundef writeonly captures(none) initializes((0, 64)) [[OUTPUT:%.*]], ptr noundef [[ADDR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = tail call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(ptr [[ADDR]]) #[[ATTR1:[0-9]+]], !srcloc [[META2:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(ptr [[ADDR]]) #[[ATTR1:[0-9]+]], !srcloc [[META6:![0-9]+]] // CHECK-NEXT: store i512 [[TMP0]], ptr [[OUTPUT]], align 8 // CHECK-NEXT: ret void // @@ -19,7 +19,7 @@ void load(struct foo *output, void *addr) // CHECK-SAME: ptr noundef readonly captures(none) [[INPUT:%.*]], ptr noundef [[ADDR:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load i512, ptr [[INPUT]], align 8 -// CHECK-NEXT: tail call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP0]], ptr [[ADDR]]) #[[ATTR1]], !srcloc [[META3:![0-9]+]] +// CHECK-NEXT: tail call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP0]], ptr [[ADDR]]) #[[ATTR1]], !srcloc [[META7:![0-9]+]] // CHECK-NEXT: ret void // void store(const struct foo *input, void *addr) @@ -30,28 +30,28 @@ void store(const struct foo *input, void *addr) // CHECK-LABEL: define dso_local void @store2( // CHECK-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef [[ADDR:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[IN]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[IN]], align 4, !tbaa [[INT_TBAA2:![0-9]+]] // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP0]] to i64 // CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 4 -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[INT_TBAA4]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[TMP1]] to i64 // CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4, !tbaa [[INT_TBAA4]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[CONV5:%.*]] = sext i32 [[TMP2]] to i64 // CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 64 -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4, !tbaa [[INT_TBAA4]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[CONV8:%.*]] = sext i32 [[TMP3]] to i64 // CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 100 -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4, !tbaa [[INT_TBAA4]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[CONV11:%.*]] = sext i32 [[TMP4]] to i64 // CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 144 -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4, !tbaa [[INT_TBAA4]] +// CHECK-NEXT: 
[[TMP5:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[CONV14:%.*]] = sext i32 [[TMP5]] to i64 // CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 196 -// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4, !tbaa [[INT_TBAA4]] +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[CONV17:%.*]] = sext i32 [[TMP6]] to i64 // CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds nuw i8, ptr [[IN]], i64 256 -// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !tbaa [[INT_TBAA4]] +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: [[CONV20:%.*]] = sext i32 [[TMP7]] to i64 // CHECK-NEXT: [[S_SROA_10_0_INSERT_EXT:%.*]] = zext i64 [[CONV20]] to i512 // CHECK-NEXT: [[S_SROA_10_0_INSERT_SHIFT:%.*]] = shl nuw i512 [[S_SROA_10_0_INSERT_EXT]], 448 @@ -84,11 +84,11 @@ void store2(int *in, void *addr) __asm__ volatile ("st64b %0,[%1]" : : "r" (s), "r" (addr) : "memory" ); } //. -// CHECK: [[META2]] = !{i64 789} -// CHECK: [[META3]] = !{i64 1368} -// CHECK: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// CHECK: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} -// CHECK: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} -// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META6]] = !{i64 789} +// CHECK: [[META7]] = !{i64 1368} // CHECK: [[META8]] = !{i64 5992} //. diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1.c index 19e5243c8a625..1ed59c6c80bdc 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1.c @@ -24,12 +24,12 @@ // CHECK-LABEL: @test_svld1_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[BASE:%.*]], i32 1, [[PG:%.*]], zeroinitializer) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[BASE:%.*]], [[PG:%.*]], zeroinitializer) // CHECK-NEXT: ret [[TMP0]] // // CPP-CHECK-LABEL: @_Z13test_svld1_s8u10__SVBool_tPKa( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[BASE:%.*]], i32 1, [[PG:%.*]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[BASE:%.*]], [[PG:%.*]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP0]] // svint8_t test_svld1_s8(svbool_t pg, const int8_t *base) MODE_ATTR @@ -40,13 +40,13 @@ svint8_t test_svld1_s8(svbool_t pg, const int8_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP1]] // // CPP-CHECK-LABEL: @_Z14test_svld1_s16u10__SVBool_tPKs( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP1]] // svint16_t test_svld1_s16(svbool_t pg, const int16_t *base) MODE_ATTR @@ -57,13 +57,13 @@ svint16_t test_svld1_s16(svbool_t pg, const int16_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP1]] // // CPP-CHECK-LABEL: @_Z14test_svld1_s32u10__SVBool_tPKi( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP1]] // svint32_t test_svld1_s32(svbool_t pg, const int32_t *base) MODE_ATTR @@ -74,13 +74,13 @@ svint32_t test_svld1_s32(svbool_t pg, const int32_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1_s64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP1]] // // CPP-CHECK-LABEL: @_Z14test_svld1_s64u10__SVBool_tPKl( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP1]] // svint64_t test_svld1_s64(svbool_t pg, const int64_t *base) MODE_ATTR @@ -90,12 +90,12 @@ svint64_t test_svld1_s64(svbool_t pg, const int64_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[BASE:%.*]], i32 1, [[PG:%.*]], zeroinitializer) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[BASE:%.*]], [[PG:%.*]], zeroinitializer) // CHECK-NEXT: ret [[TMP0]] // // CPP-CHECK-LABEL: @_Z13test_svld1_u8u10__SVBool_tPKh( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[BASE:%.*]], i32 1, [[PG:%.*]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[BASE:%.*]], [[PG:%.*]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP0]] // svuint8_t test_svld1_u8(svbool_t pg, const uint8_t *base) MODE_ATTR @@ -106,13 +106,13 @@ svuint8_t test_svld1_u8(svbool_t pg, const uint8_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( 
[[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP1]] // // CPP-CHECK-LABEL: @_Z14test_svld1_u16u10__SVBool_tPKt( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP1]] // svuint16_t test_svld1_u16(svbool_t pg, const uint16_t *base) MODE_ATTR @@ -123,13 +123,13 @@ svuint16_t test_svld1_u16(svbool_t pg, const uint16_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP1]] // // CPP-CHECK-LABEL: @_Z14test_svld1_u32u10__SVBool_tPKj( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP1]] // svuint32_t test_svld1_u32(svbool_t pg, const uint32_t *base) MODE_ATTR @@ -140,13 +140,13 @@ svuint32_t test_svld1_u32(svbool_t pg, const uint32_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1_u64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP1]] // // CPP-CHECK-LABEL: @_Z14test_svld1_u64u10__SVBool_tPKm( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP1]] // svuint64_t test_svld1_u64(svbool_t pg, const uint64_t *base) MODE_ATTR @@ -157,13 +157,13 @@ svuint64_t test_svld1_u64(svbool_t pg, const uint64_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1_f16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8f16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8f16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP1]] // // CPP-CHECK-LABEL: @_Z14test_svld1_f16u10__SVBool_tPKDh( // 
CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8f16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8f16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP1]] // svfloat16_t test_svld1_f16(svbool_t pg, const float16_t *base) MODE_ATTR @@ -174,13 +174,13 @@ svfloat16_t test_svld1_f16(svbool_t pg, const float16_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4f32.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4f32.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP1]] // // CPP-CHECK-LABEL: @_Z14test_svld1_f32u10__SVBool_tPKf( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4f32.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4f32.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP1]] // svfloat32_t test_svld1_f32(svbool_t pg, const float32_t *base) MODE_ATTR @@ -191,13 +191,13 @@ svfloat32_t test_svld1_f32(svbool_t pg, const float32_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1_f64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP1]] // // CPP-CHECK-LABEL: @_Z14test_svld1_f64u10__SVBool_tPKd( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP1]] // svfloat64_t test_svld1_f64(svbool_t pg, const float64_t *base) MODE_ATTR @@ -207,12 +207,12 @@ svfloat64_t test_svld1_f64(svbool_t pg, const float64_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1_mf8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[BASE:%.*]], i32 1, [[PG:%.*]], zeroinitializer) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[BASE:%.*]], [[PG:%.*]], zeroinitializer) // CHECK-NEXT: ret [[TMP0]] // // CPP-CHECK-LABEL: @_Z14test_svld1_mf8u10__SVBool_tPKu6__mfp8( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[BASE:%.*]], i32 1, [[PG:%.*]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[BASE:%.*]], [[PG:%.*]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP0]] // svmfloat8_t test_svld1_mf8(svbool_t pg, const mfloat8_t *base) MODE_ATTR @@ -226,7 +226,7 @@ svmfloat8_t 
test_svld1_mf8(svbool_t pg, const mfloat8_t *base) MODE_ATTR // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, [[PG:%.*]], zeroinitializer) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP2]], [[PG:%.*]], zeroinitializer) // CHECK-NEXT: ret [[TMP3]] // // CPP-CHECK-LABEL: @_Z18test_svld1_vnum_s8u10__SVBool_tPKal( @@ -235,7 +235,7 @@ svmfloat8_t test_svld1_mf8(svbool_t pg, const mfloat8_t *base) MODE_ATTR // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, [[PG:%.*]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP2]], [[PG:%.*]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP3]] // svint8_t test_svld1_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) MODE_ATTR @@ -250,7 +250,7 @@ svint8_t test_svld1_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) MODE_ // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] // // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_s16u10__SVBool_tPKsl( @@ -260,7 +260,7 @@ svint8_t test_svld1_vnum_s8(svbool_t pg, const int8_t *base, int64_t vnum) MODE_ // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] // svint16_t test_svld1_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) MODE_ATTR @@ -275,7 +275,7 @@ svint16_t test_svld1_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) MO // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] // // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_s32u10__SVBool_tPKil( @@ -285,7 +285,7 @@ svint16_t test_svld1_vnum_s16(svbool_t pg, const int16_t *base, int64_t vnum) MO // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] // svint32_t test_svld1_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) MODE_ATTR @@ -300,7 +300,7 @@ svint32_t test_svld1_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) MO // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] // // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_s64u10__SVBool_tPKll( @@ -310,7 +310,7 @@ svint32_t test_svld1_vnum_s32(svbool_t pg, const int32_t *base, int64_t vnum) MO // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] // svint64_t test_svld1_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) MODE_ATTR @@ -324,7 +324,7 @@ svint64_t test_svld1_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) MO // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, [[PG:%.*]], zeroinitializer) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP2]], [[PG:%.*]], zeroinitializer) // CHECK-NEXT: ret [[TMP3]] // // CPP-CHECK-LABEL: @_Z18test_svld1_vnum_u8u10__SVBool_tPKhl( @@ -333,7 +333,7 @@ svint64_t test_svld1_vnum_s64(svbool_t pg, const int64_t *base, int64_t vnum) MO // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, [[PG:%.*]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP2]], [[PG:%.*]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP3]] // svuint8_t test_svld1_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) MODE_ATTR @@ -348,7 +348,7 @@ svuint8_t test_svld1_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) MOD // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // 
CHECK-NEXT: ret [[TMP4]] // // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_u16u10__SVBool_tPKtl( @@ -358,7 +358,7 @@ svuint8_t test_svld1_vnum_u8(svbool_t pg, const uint8_t *base, int64_t vnum) MOD // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] // svuint16_t test_svld1_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum) MODE_ATTR @@ -373,7 +373,7 @@ svuint16_t test_svld1_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] // // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_u32u10__SVBool_tPKjl( @@ -383,7 +383,7 @@ svuint16_t test_svld1_vnum_u16(svbool_t pg, const uint16_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i32.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] // svuint32_t test_svld1_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum) MODE_ATTR @@ -398,7 +398,7 @@ svuint32_t test_svld1_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum) // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] // // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_u64u10__SVBool_tPKml( @@ -408,7 +408,7 @@ svuint32_t test_svld1_vnum_u32(svbool_t pg, const uint32_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i64.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] // svuint64_t test_svld1_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum) MODE_ATTR @@ -423,7 +423,7 @@ svuint64_t test_svld1_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum) // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw 
i64 [[TMP1]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8f16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8f16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] // // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_f16u10__SVBool_tPKDhl( @@ -433,7 +433,7 @@ svuint64_t test_svld1_vnum_u64(svbool_t pg, const uint64_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8f16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8f16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] // svfloat16_t test_svld1_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum) MODE_ATTR @@ -448,7 +448,7 @@ svfloat16_t test_svld1_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4f32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] // // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_f32u10__SVBool_tPKfl( @@ -458,7 +458,7 @@ svfloat16_t test_svld1_vnum_f16(svbool_t pg, const float16_t *base, int64_t vnum // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4f32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] // svfloat32_t test_svld1_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum) MODE_ATTR @@ -473,7 +473,7 @@ svfloat32_t test_svld1_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] // // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_f64u10__SVBool_tPKdl( @@ -483,7 +483,7 @@ svfloat32_t test_svld1_vnum_f32(svbool_t pg, const float32_t *base, int64_t vnum // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr [[TMP3]], i32 1, [[TMP0]], 
zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2f64.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] // svfloat64_t test_svld1_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum) MODE_ATTR @@ -497,7 +497,7 @@ svfloat64_t test_svld1_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, [[PG:%.*]], zeroinitializer) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP2]], [[PG:%.*]], zeroinitializer) // CHECK-NEXT: ret [[TMP3]] // // CPP-CHECK-LABEL: @_Z19test_svld1_vnum_mf8u10__SVBool_tPKu6__mfp8l( @@ -506,7 +506,7 @@ svfloat64_t test_svld1_vnum_f64(svbool_t pg, const float64_t *base, int64_t vnum // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]] // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr [[TMP2]], i32 1, [[PG:%.*]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP2]], [[PG:%.*]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP3]] // svmfloat8_t test_svld1_vnum_mf8(svbool_t pg, const mfloat8_t *base, int64_t vnum) MODE_ATTR @@ -1205,13 +1205,13 @@ svfloat64_t test_svld1_gather_u64base_index_f64(svbool_t pg, svuint64_t bases, i // CHECK-LABEL: @test_svld1_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP1]] // // CPP-CHECK-LABEL: @_Z15test_svld1_bf16u10__SVBool_tPKu6__bf16( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP1]] // svbfloat16_t test_svld1_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR @@ -1226,7 +1226,7 @@ svbfloat16_t test_svld1_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: ret [[TMP4]] // // CPP-CHECK-LABEL: @_Z20test_svld1_vnum_bf16u10__SVBool_tPKu6__bf16l( @@ -1236,7 +1236,7 @@ svbfloat16_t test_svld1_bf16(svbool_t pg, const bfloat16_t *base) MODE_ATTR // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 
[[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8bf16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: ret [[TMP4]] // svbfloat16_t test_svld1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum) MODE_ATTR diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1sb.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1sb.c index 2757f2873cc83..eb40da2960f3d 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1sb.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1sb.c @@ -25,14 +25,14 @@ // CHECK-LABEL: @test_svld1sb_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1sb_s16u10__SVBool_tPKa( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -44,14 +44,14 @@ svint16_t test_svld1sb_s16(svbool_t pg, const int8_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1sb_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1sb_s32u10__SVBool_tPKa( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -63,14 +63,14 @@ svint32_t test_svld1sb_s32(svbool_t pg, const int8_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1sb_s64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1sb_s64u10__SVBool_tPKa( // CPP-CHECK-NEXT: entry: // 
CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -82,14 +82,14 @@ svint64_t test_svld1sb_s64(svbool_t pg, const int8_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1sb_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1sb_u16u10__SVBool_tPKa( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -101,14 +101,14 @@ svuint16_t test_svld1sb_u16(svbool_t pg, const int8_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1sb_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1sb_u32u10__SVBool_tPKa( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -120,14 +120,14 @@ svuint32_t test_svld1sb_u32(svbool_t pg, const int8_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1sb_u64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1sb_u64u10__SVBool_tPKa( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.masked.load.nxv2i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -143,7 +143,7 @@ svuint64_t test_svld1sb_u64(svbool_t pg, const int8_t *base) MODE_ATTR // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CHECK-NEXT: ret [[TMP5]] // @@ -154,7 +154,7 @@ svuint64_t test_svld1sb_u64(svbool_t pg, const int8_t *base) MODE_ATTR // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CPP-CHECK-NEXT: ret [[TMP5]] // @@ -170,7 +170,7 @@ svint16_t test_svld1sb_vnum_s16(svbool_t pg, const int8_t *base, int64_t vnum) M // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CHECK-NEXT: ret [[TMP5]] // @@ -181,7 +181,7 @@ svint16_t test_svld1sb_vnum_s16(svbool_t pg, const int8_t *base, int64_t vnum) M // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CPP-CHECK-NEXT: ret [[TMP5]] // @@ -197,7 +197,7 @@ svint32_t test_svld1sb_vnum_s32(svbool_t pg, const int8_t *base, int64_t vnum) M // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CHECK-NEXT: ret [[TMP5]] // @@ -208,7 +208,7 @@ svint32_t test_svld1sb_vnum_s32(svbool_t pg, const int8_t *base, int64_t vnum) M // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = 
getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CPP-CHECK-NEXT: ret [[TMP5]] // @@ -224,7 +224,7 @@ svint64_t test_svld1sb_vnum_s64(svbool_t pg, const int8_t *base, int64_t vnum) M // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CHECK-NEXT: ret [[TMP5]] // @@ -235,7 +235,7 @@ svint64_t test_svld1sb_vnum_s64(svbool_t pg, const int8_t *base, int64_t vnum) M // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CPP-CHECK-NEXT: ret [[TMP5]] // @@ -251,7 +251,7 @@ svuint16_t test_svld1sb_vnum_u16(svbool_t pg, const int8_t *base, int64_t vnum) // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CHECK-NEXT: ret [[TMP5]] // @@ -262,7 +262,7 @@ svuint16_t test_svld1sb_vnum_u16(svbool_t pg, const int8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CPP-CHECK-NEXT: ret [[TMP5]] // @@ -278,7 +278,7 @@ svuint32_t test_svld1sb_vnum_u32(svbool_t pg, const int8_t *base, int64_t vnum) // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CHECK-NEXT: ret [[TMP5]] // @@ -289,7 +289,7 @@ svuint32_t 
test_svld1sb_vnum_u32(svbool_t pg, const int8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i8.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CPP-CHECK-NEXT: ret [[TMP5]] // diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1sh.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1sh.c index dbc762fb8632a..e1cbb53cb6e91 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1sh.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1sh.c @@ -25,14 +25,14 @@ // CHECK-LABEL: @test_svld1sh_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1sh_s32u10__SVBool_tPKs( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -44,14 +44,14 @@ svint32_t test_svld1sh_s32(svbool_t pg, const int16_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1sh_s64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1sh_s64u10__SVBool_tPKs( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -63,14 +63,14 @@ svint64_t test_svld1sh_s64(svbool_t pg, const int16_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1sh_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] 
= sext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1sh_u32u10__SVBool_tPKs( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -82,14 +82,14 @@ svuint32_t test_svld1sh_u32(svbool_t pg, const int16_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1sh_u64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1sh_u64u10__SVBool_tPKs( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -105,7 +105,7 @@ svuint64_t test_svld1sh_u64(svbool_t pg, const int16_t *base) MODE_ATTR // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CHECK-NEXT: ret [[TMP5]] // @@ -116,7 +116,7 @@ svuint64_t test_svld1sh_u64(svbool_t pg, const int16_t *base) MODE_ATTR // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CPP-CHECK-NEXT: ret [[TMP5]] // @@ -132,7 +132,7 @@ svint32_t test_svld1sh_vnum_s32(svbool_t pg, const int16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CHECK-NEXT: ret [[TMP5]] // @@ -143,7 +143,7 @@ 
svint32_t test_svld1sh_vnum_s32(svbool_t pg, const int16_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CPP-CHECK-NEXT: ret [[TMP5]] // @@ -159,7 +159,7 @@ svint64_t test_svld1sh_vnum_s64(svbool_t pg, const int16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CHECK-NEXT: ret [[TMP5]] // @@ -170,7 +170,7 @@ svint64_t test_svld1sh_vnum_s64(svbool_t pg, const int16_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv4i16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CPP-CHECK-NEXT: ret [[TMP5]] // @@ -186,7 +186,7 @@ svuint32_t test_svld1sh_vnum_u32(svbool_t pg, const int16_t *base, int64_t vnum) // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CHECK-NEXT: ret [[TMP5]] // @@ -197,7 +197,7 @@ svuint32_t test_svld1sh_vnum_u32(svbool_t pg, const int16_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i16.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CPP-CHECK-NEXT: ret [[TMP5]] // diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1sw.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1sw.c index 575d2141d2815..14ee095f5d03d 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1sw.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1sw.c @@ -25,14 +25,14 @@ // CHECK-LABEL: @test_svld1sw_s64( // CHECK-NEXT: entry: // CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1sw_s64u10__SVBool_tPKi( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -44,14 +44,14 @@ svint64_t test_svld1sw_s64(svbool_t pg, const int32_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1sw_u64( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1sw_u64u10__SVBool_tPKi( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = sext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -67,7 +67,7 @@ svuint64_t test_svld1sw_u64(svbool_t pg, const int32_t *base) MODE_ATTR // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CHECK-NEXT: ret [[TMP5]] // @@ -78,7 +78,7 @@ svuint64_t test_svld1sw_u64(svbool_t pg, const int32_t *base) MODE_ATTR // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CPP-CHECK-NEXT: ret [[TMP5]] // @@ -94,7 +94,7 @@ svint64_t test_svld1sw_vnum_s64(svbool_t pg, const int32_t *base, int64_t vnum) // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], 
i64 [[DOTIDX]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CHECK-NEXT: ret [[TMP5]] // @@ -105,7 +105,7 @@ svint64_t test_svld1sw_vnum_s64(svbool_t pg, const int32_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.masked.load.nxv2i32.p0(ptr align 1 [[TMP3]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP5:%.*]] = sext [[TMP4]] to // CPP-CHECK-NEXT: ret [[TMP5]] // diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ub.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ub.c index 07e88152a6f53..3e0d28273d0b6 100644 --- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ub.c +++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ub.c @@ -25,14 +25,14 @@ // CHECK-LABEL: @test_svld1ub_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = zext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1ub_s16u10__SVBool_tPKh( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv8i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -44,14 +44,14 @@ svint16_t test_svld1ub_s16(svbool_t pg, const uint8_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1ub_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CHECK-NEXT: [[TMP2:%.*]] = zext [[TMP1]] to // CHECK-NEXT: ret [[TMP2]] // // CPP-CHECK-LABEL: @_Z16test_svld1ub_s32u10__SVBool_tPKh( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr [[BASE:%.*]], i32 1, [[TMP0]], zeroinitializer) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[BASE:%.*]], [[TMP0]], zeroinitializer) // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext [[TMP1]] to // CPP-CHECK-NEXT: ret [[TMP2]] // @@ -63,14 +63,14 @@ svint32_t test_svld1ub_s32(svbool_t pg, const uint8_t *base) MODE_ATTR // CHECK-LABEL: @test_svld1ub_s64( // CHECK-NEXT: entry: // CHECK-NEXT: 
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ub.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ub.c
index 07e88152a6f53..3e0d28273d0b6 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ub.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1ub.c
@@ -25,14 +25,14 @@
 // CHECK-LABEL: @test_svld1ub_s16(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
 // CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 8 x i8> [[TMP1]] to <vscale x 8 x i16>
 // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svld1ub_s16u10__SVBool_tPKh(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 8 x i8> [[TMP1]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
 //
@@ -44,14 +44,14 @@ svint16_t test_svld1ub_s16(svbool_t pg, const uint8_t *base) MODE_ATTR
 // CHECK-LABEL: @test_svld1ub_s32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
 // CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 4 x i8> [[TMP1]] to <vscale x 4 x i32>
 // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svld1ub_s32u10__SVBool_tPKh(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 4 x i8> [[TMP1]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
 //
@@ -63,14 +63,14 @@ svint32_t test_svld1ub_s32(svbool_t pg, const uint8_t *base) MODE_ATTR
 // CHECK-LABEL: @test_svld1ub_s64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
 // CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 2 x i8> [[TMP1]] to <vscale x 2 x i64>
 // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svld1ub_s64u10__SVBool_tPKh(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 2 x i8> [[TMP1]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
 //
@@ -82,14 +82,14 @@ svint64_t test_svld1ub_s64(svbool_t pg, const uint8_t *base) MODE_ATTR
 // CHECK-LABEL: @test_svld1ub_u16(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
 // CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 8 x i8> [[TMP1]] to <vscale x 8 x i16>
 // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svld1ub_u16u10__SVBool_tPKh(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 8 x i8> [[TMP1]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP2]]
 //
@@ -101,14 +101,14 @@ svuint16_t test_svld1ub_u16(svbool_t pg, const uint8_t *base) MODE_ATTR
 // CHECK-LABEL: @test_svld1ub_u32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
 // CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 4 x i8> [[TMP1]] to <vscale x 4 x i32>
 // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svld1ub_u32u10__SVBool_tPKh(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 4 x i8> [[TMP1]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
 //
@@ -120,14 +120,14 @@ svuint32_t test_svld1ub_u32(svbool_t pg, const uint8_t *base) MODE_ATTR
 // CHECK-LABEL: @test_svld1ub_u64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
 // CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 2 x i8> [[TMP1]] to <vscale x 2 x i64>
 // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svld1ub_u64u10__SVBool_tPKh(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 2 x i8> [[TMP1]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
 //
@@ -143,7 +143,7 @@ svuint64_t test_svld1ub_u64(svbool_t pg, const uint8_t *base) MODE_ATTR
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
 // CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 8 x i8> [[TMP4]] to <vscale x 8 x i16>
 // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP5]]
 //
@@ -154,7 +154,7 @@ svuint64_t test_svld1ub_u64(svbool_t pg, const uint8_t *base) MODE_ATTR
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 8 x i8> [[TMP4]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP5]]
 //
@@ -170,7 +170,7 @@ svint16_t test_svld1ub_vnum_s16(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
 // CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 4 x i8> [[TMP4]] to <vscale x 4 x i32>
 // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP5]]
 //
@@ -181,7 +181,7 @@ svint16_t test_svld1ub_vnum_s16(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 4 x i8> [[TMP4]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP5]]
 //
@@ -197,7 +197,7 @@ svint32_t test_svld1ub_vnum_s32(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
 // CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i64>
 // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP5]]
 //
@@ -208,7 +208,7 @@ svint32_t test_svld1ub_vnum_s32(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP5]]
 //
@@ -224,7 +224,7 @@ svint64_t test_svld1ub_vnum_s64(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
 // CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 8 x i8> [[TMP4]] to <vscale x 8 x i16>
 // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP5]]
 //
@@ -235,7 +235,7 @@ svint64_t test_svld1ub_vnum_s64(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 8 x i8> [[TMP4]] to <vscale x 8 x i16>
 // CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP5]]
 //
@@ -251,7 +251,7 @@ svuint16_t test_svld1ub_vnum_u16(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
 // CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 4 x i8> [[TMP4]] to <vscale x 4 x i32>
 // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP5]]
 //
@@ -262,7 +262,7 @@ svuint16_t test_svld1ub_vnum_u16(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i8> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 4 x i8> [[TMP4]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP5]]
 //
@@ -278,7 +278,7 @@ svuint32_t test_svld1ub_vnum_u32(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
 // CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i64>
 // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP5]]
 //
@@ -289,7 +289,7 @@ svuint32_t test_svld1ub_vnum_u32(svbool_t pg, const uint8_t *base, int64_t vnum)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i8> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP5]]
 //
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1uh.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1uh.c
index 6d91c1ecd7c7a..18dfc0825a975 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1uh.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1uh.c
@@ -25,14 +25,14 @@
 // CHECK-LABEL: @test_svld1uh_s32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
 // CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 4 x i16> [[TMP1]] to <vscale x 4 x i32>
 // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svld1uh_s32u10__SVBool_tPKt(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 4 x i16> [[TMP1]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
 //
@@ -44,14 +44,14 @@ svint32_t test_svld1uh_s32(svbool_t pg, const uint16_t *base) MODE_ATTR
 // CHECK-LABEL: @test_svld1uh_s64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
 // CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 2 x i16> [[TMP1]] to <vscale x 2 x i64>
 // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svld1uh_s64u10__SVBool_tPKt(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 2 x i16> [[TMP1]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
 //
@@ -63,14 +63,14 @@ svint64_t test_svld1uh_s64(svbool_t pg, const uint16_t *base) MODE_ATTR
 // CHECK-LABEL: @test_svld1uh_u32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
 // CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 4 x i16> [[TMP1]] to <vscale x 4 x i32>
 // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svld1uh_u32u10__SVBool_tPKt(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 4 x i16> [[TMP1]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]]
 //
@@ -82,14 +82,14 @@ svuint32_t test_svld1uh_u32(svbool_t pg, const uint16_t *base) MODE_ATTR
 // CHECK-LABEL: @test_svld1uh_u64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
 // CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 2 x i16> [[TMP1]] to <vscale x 2 x i64>
 // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svld1uh_u64u10__SVBool_tPKt(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 2 x i16> [[TMP1]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
 //
@@ -105,7 +105,7 @@ svuint64_t test_svld1uh_u64(svbool_t pg, const uint16_t *base) MODE_ATTR
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
 // CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 4 x i16> [[TMP4]] to <vscale x 4 x i32>
 // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP5]]
 //
@@ -116,7 +116,7 @@ svuint64_t test_svld1uh_u64(svbool_t pg, const uint16_t *base) MODE_ATTR
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 4 x i16> [[TMP4]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP5]]
 //
@@ -132,7 +132,7 @@ svint32_t test_svld1uh_vnum_s32(svbool_t pg, const uint16_t *base, int64_t vnum)
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
 // CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 2 x i16> [[TMP4]] to <vscale x 2 x i64>
 // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP5]]
 //
@@ -143,7 +143,7 @@ svint32_t test_svld1uh_vnum_s32(svbool_t pg, const uint16_t *base, int64_t vnum)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 2 x i16> [[TMP4]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP5]]
 //
@@ -159,7 +159,7 @@ svint64_t test_svld1uh_vnum_s64(svbool_t pg, const uint16_t *base, int64_t vnum)
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
 // CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 4 x i16> [[TMP4]] to <vscale x 4 x i32>
 // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP5]]
 //
@@ -170,7 +170,7 @@ svint64_t test_svld1uh_vnum_s64(svbool_t pg, const uint16_t *base, int64_t vnum)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i16> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 4 x i16> [[TMP4]] to <vscale x 4 x i32>
 // CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP5]]
 //
@@ -186,7 +186,7 @@ svuint32_t test_svld1uh_vnum_u32(svbool_t pg, const uint16_t *base, int64_t vnum
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
 // CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 2 x i16> [[TMP4]] to <vscale x 2 x i64>
 // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP5]]
 //
@@ -197,7 +197,7 @@ svuint32_t test_svld1uh_vnum_u32(svbool_t pg, const uint16_t *base, int64_t vnum
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i16> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 2 x i16> [[TMP4]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP5]]
 //
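The extending loads (svld1ub/svld1uh/svld1uw and the signed svld1sb/svld1sh/svld1sw) all lower to a masked load of the narrow element type followed by a separate zext or sext, so only the masked-load line changes in each hunk; the widening instruction is untouched. A sketch of an svld1uh-into-64-bit lowering under the new form (hypothetical value names):

; Load 16-bit elements under the predicate, then widen to 64 bits.
%narrow = tail call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 1 %base, <vscale x 2 x i1> %pg, <vscale x 2 x i16> zeroinitializer)
%wide = zext <vscale x 2 x i16> %narrow to <vscale x 2 x i64>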
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1uw.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1uw.c
index 7be23987aedf5..62637ff5529bf 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1uw.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_ld1uw.c
@@ -25,14 +25,14 @@
 // CHECK-LABEL: @test_svld1uw_s64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
 // CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 2 x i32> [[TMP1]] to <vscale x 2 x i64>
 // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svld1uw_s64u10__SVBool_tPKj(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 2 x i32> [[TMP1]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
 //
@@ -44,14 +44,14 @@ svint64_t test_svld1uw_s64(svbool_t pg, const uint32_t *base) MODE_ATTR
 // CHECK-LABEL: @test_svld1uw_u64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
 // CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 2 x i32> [[TMP1]] to <vscale x 2 x i64>
 // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
 //
 // CPP-CHECK-LABEL: @_Z16test_svld1uw_u64u10__SVBool_tPKj(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = zext <vscale x 2 x i32> [[TMP1]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
 //
@@ -67,7 +67,7 @@ svuint64_t test_svld1uw_u64(svbool_t pg, const uint32_t *base) MODE_ATTR
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
 // CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 2 x i32> [[TMP4]] to <vscale x 2 x i64>
 // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP5]]
 //
@@ -78,7 +78,7 @@ svuint64_t test_svld1uw_u64(svbool_t pg, const uint32_t *base) MODE_ATTR
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 2 x i32> [[TMP4]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP5]]
 //
@@ -94,7 +94,7 @@ svint64_t test_svld1uw_vnum_s64(svbool_t pg, const uint32_t *base, int64_t vnum)
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
 // CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 2 x i32> [[TMP4]] to <vscale x 2 x i64>
 // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP5]]
 //
@@ -105,7 +105,7 @@ svint64_t test_svld1uw_vnum_s64(svbool_t pg, const uint32_t *base, int64_t vnum)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i32> zeroinitializer)
 // CPP-CHECK-NEXT: [[TMP5:%.*]] = zext <vscale x 2 x i32> [[TMP4]] to <vscale x 2 x i64>
 // CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP5]]
 //
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1.c
index 56f8c32c23099..4d0005e07a02f 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1.c
@@ -24,12 +24,12 @@
 // CHECK-LABEL: @test_svst1_s8(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z13test_svst1_s8u10__SVBool_tPau10__SVInt8_t(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_s8(svbool_t pg, int8_t *base, svint8_t data) MODE_ATTR
@@ -40,13 +40,13 @@ void test_svst1_s8(svbool_t pg, int8_t *base, svint8_t data) MODE_ATTR
 // CHECK-LABEL: @test_svst1_s16(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z14test_svst1_s16u10__SVBool_tPsu11__SVInt16_t(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_s16(svbool_t pg, int16_t *base, svint16_t data) MODE_ATTR
@@ -57,13 +57,13 @@ void test_svst1_s16(svbool_t pg, int16_t *base, svint16_t data) MODE_ATTR
 // CHECK-LABEL: @test_svst1_s32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z14test_svst1_s32u10__SVBool_tPiu11__SVInt32_t(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_s32(svbool_t pg, int32_t *base, svint32_t data) MODE_ATTR
@@ -74,13 +74,13 @@ void test_svst1_s32(svbool_t pg, int32_t *base, svint32_t data) MODE_ATTR
 // CHECK-LABEL: @test_svst1_s64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z14test_svst1_s64u10__SVBool_tPlu11__SVInt64_t(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_s64(svbool_t pg, int64_t *base, svint64_t data) MODE_ATTR
@@ -90,12 +90,12 @@ void test_svst1_s64(svbool_t pg, int64_t *base, svint64_t data) MODE_ATTR
 // CHECK-LABEL: @test_svst1_u8(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z13test_svst1_u8u10__SVBool_tPhu11__SVUint8_t(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_u8(svbool_t pg, uint8_t *base, svuint8_t data) MODE_ATTR
@@ -106,13 +106,13 @@ void test_svst1_u8(svbool_t pg, uint8_t *base, svuint8_t data) MODE_ATTR
 // CHECK-LABEL: @test_svst1_u16(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z14test_svst1_u16u10__SVBool_tPtu12__SVUint16_t(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_u16(svbool_t pg, uint16_t *base, svuint16_t data) MODE_ATTR
@@ -123,13 +123,13 @@ void test_svst1_u16(svbool_t pg, uint16_t *base, svuint16_t data) MODE_ATTR
 // CHECK-LABEL: @test_svst1_u32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z14test_svst1_u32u10__SVBool_tPju12__SVUint32_t(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_u32(svbool_t pg, uint32_t *base, svuint32_t data) MODE_ATTR
@@ -140,13 +140,13 @@ void test_svst1_u32(svbool_t pg, uint32_t *base, svuint32_t data) MODE_ATTR
 // CHECK-LABEL: @test_svst1_u64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z14test_svst1_u64u10__SVBool_tPmu12__SVUint64_t(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_u64(svbool_t pg, uint64_t *base, svuint64_t data) MODE_ATTR
@@ -157,13 +157,13 @@ void test_svst1_u64(svbool_t pg, uint64_t *base, svuint64_t data) MODE_ATTR
 // CHECK-LABEL: @test_svst1_f16(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8f16.p0(<vscale x 8 x half> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8f16.p0(<vscale x 8 x half> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z14test_svst1_f16u10__SVBool_tPDhu13__SVFloat16_t(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8f16.p0(<vscale x 8 x half> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8f16.p0(<vscale x 8 x half> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_f16(svbool_t pg, float16_t *base, svfloat16_t data) MODE_ATTR
@@ -174,13 +174,13 @@ void test_svst1_f16(svbool_t pg, float16_t *base, svfloat16_t data) MODE_ATTR
 // CHECK-LABEL: @test_svst1_f32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z14test_svst1_f32u10__SVBool_tPfu13__SVFloat32_t(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_f32(svbool_t pg, float32_t *base, svfloat32_t data) MODE_ATTR
@@ -191,13 +191,13 @@ void test_svst1_f32(svbool_t pg, float32_t *base, svfloat32_t data) MODE_ATTR
 // CHECK-LABEL: @test_svst1_f64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z14test_svst1_f64u10__SVBool_tPdu13__SVFloat64_t(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_f64(svbool_t pg, float64_t *base, svfloat64_t data) MODE_ATTR
@@ -207,12 +207,12 @@ void test_svst1_f64(svbool_t pg, float64_t *base, svfloat64_t data) MODE_ATTR
 // CHECK-LABEL: @test_svst1_mf8(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z14test_svst1_mf8u10__SVBool_tPu6__mfp8u13__SVMfloat8_t(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_mf8(svbool_t pg, mfloat8_t *base, svmfloat8_t data) MODE_ATTR
@@ -226,7 +226,7 @@ void test_svst1_mf8(svbool_t pg, mfloat8_t *base, svmfloat8_t data) MODE_ATTR
 // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]]
 // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr align 1 [[TMP2]], <vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z18test_svst1_vnum_s8u10__SVBool_tPalu10__SVInt8_t(
@@ -235,7 +235,7 @@ void test_svst1_mf8(svbool_t pg, mfloat8_t *base, svmfloat8_t data) MODE_ATTR
 // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]]
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr align 1 [[TMP2]], <vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8_t data) MODE_ATTR
@@ -250,7 +250,7 @@ void test_svst1_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8_t data)
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_s16u10__SVBool_tPslu11__SVInt16_t(
@@ -260,7 +260,7 @@ void test_svst1_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8_t data)
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16_t data) MODE_ATTR
@@ -275,7 +275,7 @@ void test_svst1_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16_t dat
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_s32u10__SVBool_tPilu11__SVInt32_t(
@@ -285,7 +285,7 @@ void test_svst1_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16_t dat
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32_t data) MODE_ATTR
@@ -300,7 +300,7 @@ void test_svst1_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32_t dat
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_s64u10__SVBool_tPllu11__SVInt64_t(
@@ -310,7 +310,7 @@ void test_svst1_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32_t dat
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64_t data) MODE_ATTR
@@ -324,7 +324,7 @@ void test_svst1_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64_t dat
 // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]]
 // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr align 1 [[TMP2]], <vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z18test_svst1_vnum_u8u10__SVBool_tPhlu11__SVUint8_t(
@@ -333,7 +333,7 @@ void test_svst1_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64_t dat
 // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]]
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr align 1 [[TMP2]], <vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8_t data) MODE_ATTR
@@ -348,7 +348,7 @@ void test_svst1_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8_t data
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_u16u10__SVBool_tPtlu12__SVUint16_t(
@@ -358,7 +358,7 @@ void test_svst1_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8_t data
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16_t data) MODE_ATTR
@@ -373,7 +373,7 @@ void test_svst1_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16_t d
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_u32u10__SVBool_tPjlu12__SVUint32_t(
@@ -383,7 +383,7 @@ void test_svst1_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16_t d
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32_t data) MODE_ATTR
@@ -398,7 +398,7 @@ void test_svst1_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32_t d
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_u64u10__SVBool_tPmlu12__SVUint64_t(
@@ -408,7 +408,7 @@ void test_svst1_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32_t d
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64_t data) MODE_ATTR
@@ -423,7 +423,7 @@ void test_svst1_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64_t d
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8f16.p0(<vscale x 8 x half> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8f16.p0(<vscale x 8 x half> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_f16u10__SVBool_tPDhlu13__SVFloat16_t(
@@ -433,7 +433,7 @@ void test_svst1_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64_t d
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8f16.p0(<vscale x 8 x half> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8f16.p0(<vscale x 8 x half> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16_t data) MODE_ATTR
@@ -448,7 +448,7 @@ void test_svst1_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16_t
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_f32u10__SVBool_tPflu13__SVFloat32_t(
@@ -458,7 +458,7 @@ void test_svst1_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16_t
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32_t data) MODE_ATTR
@@ -473,7 +473,7 @@ void test_svst1_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32_t
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_f64u10__SVBool_tPdlu13__SVFloat64_t(
@@ -483,7 +483,7 @@ void test_svst1_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32_t
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64_t data) MODE_ATTR
@@ -497,7 +497,7 @@ void test_svst1_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64_t
 // CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]]
 // CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr align 1 [[TMP2]], <vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z19test_svst1_vnum_mf8u10__SVBool_tPu6__mfp8lu13__SVMfloat8_t(
@@ -506,7 +506,7 @@ void test_svst1_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64_t
 // CPP-CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP1]]
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[DATA:%.*]], ptr align 1 [[TMP2]], <vscale x 16 x i1> [[PG:%.*]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_mf8(svbool_t pg, mfloat8_t *base, int64_t vnum, svmfloat8_t data) MODE_ATTR
@@ -1247,13 +1247,13 @@ void test_svst1_scatter_u64base_index_f64(svbool_t pg, svuint64_t bases, int64_t
 // CHECK-LABEL: @test_svst1_bf16(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z15test_svst1_bf16u10__SVBool_tPu6__bf16u14__SVBfloat16_t(
 // CPP-CHECK-NEXT: entry:
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[DATA:%.*]], ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[DATA:%.*]], ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data) MODE_ATTR
@@ -1268,7 +1268,7 @@ void test_svst1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data) MODE_ATTR
 // CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: @_Z20test_svst1_vnum_bf16u10__SVBool_tPu6__bf16lu14__SVBfloat16_t(
@@ -1278,7 +1278,7 @@ void test_svst1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data) MODE_ATTR
 // CPP-CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
 // CPP-CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
-// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[DATA:%.*]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CPP-CHECK-NEXT: tail call void @llvm.masked.store.nxv8bf16.p0(<vscale x 8 x bfloat> [[DATA:%.*]], ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst1_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16_t data) MODE_ATTR
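The truncating stores below are the mirror image: svst1b/svst1h/svst1w first trunc the data vector to the narrow element type and then issue the masked store, so again only the store line gains the align attribute. A sketch for an svst1b-style store of 32-bit data (hypothetical value names):

; Truncate to the narrow element type, then store under the predicate.
%narrow = trunc <vscale x 4 x i32> %data to <vscale x 4 x i8>
tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> %narrow, ptr align 1 %base, <vscale x 4 x i1> %pg)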
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1b.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1b.c
index c908bc2a483ce..3ac49e26d058e 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1b.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1b.c
@@ -24,7 +24,7 @@
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: [[TMP1:%.*]] = trunc <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x i8>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP1]], ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP1]], ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1b_s16(svbool_t pg, int8_t *base, svint16_t data) MODE_ATTR
@@ -36,7 +36,7 @@ void test_svst1b_s16(svbool_t pg, int8_t *base, svint16_t data) MODE_ATTR
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: [[TMP1:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i8>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP1]], ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP1]], ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1b_s32(svbool_t pg, int8_t *base, svint32_t data) MODE_ATTR
@@ -48,7 +48,7 @@ void test_svst1b_s32(svbool_t pg, int8_t *base, svint32_t data) MODE_ATTR
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: [[TMP1:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i8>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP1]], ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP1]], ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1b_s64(svbool_t pg, int8_t *base, svint64_t data) MODE_ATTR
@@ -60,7 +60,7 @@ void test_svst1b_s64(svbool_t pg, int8_t *base, svint64_t data) MODE_ATTR
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: [[TMP1:%.*]] = trunc <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x i8>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP1]], ptr [[BASE:%.*]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP1]], ptr align 1 [[BASE:%.*]], <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1b_u16(svbool_t pg, uint8_t *base, svuint16_t data) MODE_ATTR
@@ -72,7 +72,7 @@ void test_svst1b_u16(svbool_t pg, uint8_t *base, svuint16_t data) MODE_ATTR
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: [[TMP1:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i8>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP1]], ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP1]], ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1b_u32(svbool_t pg, uint8_t *base, svuint32_t data) MODE_ATTR
@@ -84,7 +84,7 @@ void test_svst1b_u32(svbool_t pg, uint8_t *base, svuint32_t data) MODE_ATTR
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: [[TMP1:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i8>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP1]], ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP1]], ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1b_u64(svbool_t pg, uint8_t *base, svuint64_t data) MODE_ATTR
@@ -100,7 +100,7 @@ void test_svst1b_u64(svbool_t pg, uint8_t *base, svuint64_t data) MODE_ATTR
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
 // CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x i8>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP4]], ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1b_vnum_s16(svbool_t pg, int8_t *base, int64_t vnum, svint16_t data) MODE_ATTR
@@ -116,7 +116,7 @@ void test_svst1b_vnum_s16(svbool_t pg, int8_t *base, int64_t vnum, svint16_t dat
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
 // CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i8>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP4]], ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1b_vnum_s32(svbool_t pg, int8_t *base, int64_t vnum, svint32_t data) MODE_ATTR
@@ -132,7 +132,7 @@ void test_svst1b_vnum_s32(svbool_t pg, int8_t *base, int64_t vnum, svint32_t dat
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
 // CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i8>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP4]], ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1b_vnum_s64(svbool_t pg, int8_t *base, int64_t vnum, svint64_t data) MODE_ATTR
@@ -148,7 +148,7 @@ void test_svst1b_vnum_s64(svbool_t pg, int8_t *base, int64_t vnum, svint64_t dat
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
 // CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 8 x i16> [[DATA:%.*]] to <vscale x 8 x i8>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 8 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP4]], ptr align 1 [[TMP3]], <vscale x 8 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1b_vnum_u16(svbool_t pg, uint8_t *base, int64_t vnum, svuint16_t data) MODE_ATTR
@@ -164,7 +164,7 @@ void test_svst1b_vnum_u16(svbool_t pg, uint8_t *base, int64_t vnum, svuint16_t d
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
 // CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i8>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP4]], ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1b_vnum_u32(svbool_t pg, uint8_t *base, int64_t vnum, svuint32_t data) MODE_ATTR
@@ -180,7 +180,7 @@ void test_svst1b_vnum_u32(svbool_t pg, uint8_t *base, int64_t vnum, svuint32_t d
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
 // CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i8>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP4]], ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1b_vnum_u64(svbool_t pg, uint8_t *base, int64_t vnum, svuint64_t data) MODE_ATTR
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1h.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1h.c
index 959b658425f01..1e3e0b2e14004 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1h.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1h.c
@@ -24,7 +24,7 @@
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: [[TMP1:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i16>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i16.p0(<vscale x 4 x i16> [[TMP1]], ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i16.p0(<vscale x 4 x i16> [[TMP1]], ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1h_s32(svbool_t pg, int16_t *base, svint32_t data) MODE_ATTR
@@ -36,7 +36,7 @@ void test_svst1h_s32(svbool_t pg, int16_t *base, svint32_t data) MODE_ATTR
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: [[TMP1:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i16>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> [[TMP1]], ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> [[TMP1]], ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1h_s64(svbool_t pg, int16_t *base, svint64_t data) MODE_ATTR
@@ -48,7 +48,7 @@ void test_svst1h_s64(svbool_t pg, int16_t *base, svint64_t data) MODE_ATTR
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: [[TMP1:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i16>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i16.p0(<vscale x 4 x i16> [[TMP1]], ptr [[BASE:%.*]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i16.p0(<vscale x 4 x i16> [[TMP1]], ptr align 1 [[BASE:%.*]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1h_u32(svbool_t pg, uint16_t *base, svuint32_t data) MODE_ATTR
@@ -60,7 +60,7 @@ void test_svst1h_u32(svbool_t pg, uint16_t *base, svuint32_t data) MODE_ATTR
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: [[TMP1:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i16>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> [[TMP1]], ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> [[TMP1]], ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1h_u64(svbool_t pg, uint16_t *base, svuint64_t data) MODE_ATTR
@@ -76,7 +76,7 @@ void test_svst1h_u64(svbool_t pg, uint16_t *base, svuint64_t data) MODE_ATTR
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
 // CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i16>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i16.p0(<vscale x 4 x i16> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i16.p0(<vscale x 4 x i16> [[TMP4]], ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1h_vnum_s32(svbool_t pg, int16_t *base, int64_t vnum, svint32_t data) MODE_ATTR
@@ -92,7 +92,7 @@ void test_svst1h_vnum_s32(svbool_t pg, int16_t *base, int64_t vnum, svint32_t da
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
 // CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i16>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> [[TMP4]], ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1h_vnum_s64(svbool_t pg, int16_t *base, int64_t vnum, svint64_t data) MODE_ATTR
@@ -108,7 +108,7 @@ void test_svst1h_vnum_s64(svbool_t pg, int16_t *base, int64_t vnum, svint64_t da
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
 // CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[DATA:%.*]] to <vscale x 4 x i16>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i16.p0(<vscale x 4 x i16> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv4i16.p0(<vscale x 4 x i16> [[TMP4]], ptr align 1 [[TMP3]], <vscale x 4 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1h_vnum_u32(svbool_t pg, uint16_t *base, int64_t vnum, svuint32_t data) MODE_ATTR
@@ -124,7 +124,7 @@ void test_svst1h_vnum_u32(svbool_t pg, uint16_t *base, int64_t vnum, svuint32_t
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
 // CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i16>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> [[TMP4]], ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1h_vnum_u64(svbool_t pg, uint16_t *base, int64_t vnum, svuint64_t data) MODE_ATTR
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1w.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1w.c
index 3d9e45bda7b3f..1a1241286ac2b 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1w.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_st1w.c
@@ -24,7 +24,7 @@
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: [[TMP1:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i32>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP1]], ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP1]], ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1w_s64(svbool_t pg, int32_t *base, svint64_t data) MODE_ATTR
@@ -36,7 +36,7 @@ void test_svst1w_s64(svbool_t pg, int32_t *base, svint64_t data) MODE_ATTR
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
 // CHECK-NEXT: [[TMP1:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i32>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP1]], ptr [[BASE:%.*]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP1]], ptr align 1 [[BASE:%.*]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1w_u64(svbool_t pg, uint32_t *base, svuint64_t data) MODE_ATTR
@@ -52,7 +52,7 @@ void test_svst1w_u64(svbool_t pg, uint32_t *base, svuint64_t data) MODE_ATTR
 // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]]
 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]]
 // CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 2 x i64> [[DATA:%.*]] to <vscale x 2 x i32>
-// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP4]], ptr [[TMP3]], i32 1, <vscale x 2 x i1> [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i32.p0(<vscale x 2 x i32> [[TMP4]], ptr align 1 [[TMP3]], <vscale x 2 x i1> [[TMP0]])
 // CHECK-NEXT: ret void
 //
 void test_svst1w_vnum_s64(svbool_t pg, int32_t *base, int64_t vnum, svint64_t data) MODE_ATTR
@@ -68,7 +68,7 @@ void test_svst1w_vnum_s64(svbool_t pg, int32_t *base, int64_t vnum, svint64_t
da // CHECK-NEXT: [[DOTIDX:%.*]] = mul i64 [[VNUM:%.*]], [[TMP2]] // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 [[DOTIDX]] // CHECK-NEXT: [[TMP4:%.*]] = trunc [[DATA:%.*]] to -// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i32.p0( [[TMP4]], ptr [[TMP3]], i32 1, [[TMP0]]) +// CHECK-NEXT: tail call void @llvm.masked.store.nxv2i32.p0( [[TMP4]], ptr align 1 [[TMP3]], [[TMP0]]) // CHECK-NEXT: ret void // void test_svst1w_vnum_u64(svbool_t pg, uint32_t *base, int64_t vnum, svuint64_t data) MODE_ATTR diff --git a/clang/test/CodeGen/LoongArch/inline-asm-operand-modifiers.c b/clang/test/CodeGen/LoongArch/inline-asm-operand-modifiers.c index b36fe7a7b69bb..bbb9962dd5dd2 100644 --- a/clang/test/CodeGen/LoongArch/inline-asm-operand-modifiers.c +++ b/clang/test/CodeGen/LoongArch/inline-asm-operand-modifiers.c @@ -6,7 +6,7 @@ // CHECK-LABEL: @test_z_zero( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 asm sideeffect "add.w $0, $1, ${2:z}", "=r,r,ri"(i32 [[A:%.*]], i32 0) #[[ATTR1:[0-9]+]], !srcloc !2 +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 asm sideeffect "add.w $0, $1, ${2:z}", "=r,r,ri"(i32 [[A:%.*]], i32 0) #[[ATTR1:[0-9]+]], !srcloc [[META6:![0-9]+]] // CHECK-NEXT: ret void // void test_z_zero(int a) { @@ -16,7 +16,7 @@ void test_z_zero(int a) { // CHECK-LABEL: @test_z_nonzero( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 asm sideeffect "add.w $0, $1, ${2:z}", "=r,r,ri"(i32 [[A:%.*]], i32 1) #[[ATTR1]], !srcloc !3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 asm sideeffect "add.w $0, $1, ${2:z}", "=r,r,ri"(i32 [[A:%.*]], i32 1) #[[ATTR1]], !srcloc [[META7:![0-9]+]] // CHECK-NEXT: ret void // void test_z_nonzero(int a) { diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c index dd094e5493a60..03a746c966cdd 100644 --- a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c +++ b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c @@ -6,2808 +6,2808 @@ // CHECK-LABEL: define dso_local void @xvsll_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2:![0-9]+]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6:![0-9]+]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsll_b(v32i8 _1, v32i8 _2) { return __lasx_xvsll_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvsll_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: 
[[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsll_h(v16i16 _1, v16i16 _2) { return __lasx_xvsll_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsll_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsll_w(v8i32 _1, v8i32 _2) { return __lasx_xvsll_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsll_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsll_d(v4i64 _1, v4i64 _2) { return __lasx_xvsll_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvslli_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 
32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvslli_b(v32i8 _1) { return __lasx_xvslli_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvslli_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvslli_h(v16i16 _1) { return __lasx_xvslli_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvslli_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvslli_w(v8i32 _1) { return __lasx_xvslli_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvslli_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvslli_d(v4i64 _1) { return __lasx_xvslli_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvsra_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x 
i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsra_b(v32i8 _1, v32i8 _2) { return __lasx_xvsra_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvsra_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsra_h(v16i16 _1, v16i16 _2) { return __lasx_xvsra_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsra_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsra_w(v8i32 _1, v8i32 _2) { return __lasx_xvsra_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsra_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, 
ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsra_d(v4i64 _1, v4i64 _2) { return __lasx_xvsra_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrai_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrai_b(v32i8 _1) { return __lasx_xvsrai_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrai_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrai_h(v16i16 _1) { return __lasx_xvsrai_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrai_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrai_w(v8i32 _1) { return __lasx_xvsrai_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrai_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, 
!tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsrai_d(v4i64 _1) { return __lasx_xvsrai_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrar_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrar_b(v32i8 _1, v32i8 _2) { return __lasx_xvsrar_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrar_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrar_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrar_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrar_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa 
[[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrar_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrar_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrar_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsrar_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrar_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrari_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrari_b(v32i8 _1) { return __lasx_xvsrari_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrari_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrari_h(v16i16 _1) { return __lasx_xvsrari_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrari_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef 
readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrari_w(v8i32 _1) { return __lasx_xvsrari_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrari_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsrari_d(v4i64 _1) { return __lasx_xvsrari_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrl_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrl_b(v32i8 _1, v32i8 _2) { return __lasx_xvsrl_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrl_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> [[_124]], <16 x i16> 
[[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrl_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrl_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrl_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrl_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrl_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrl_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsrl_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrl_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrli_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrli_b(v32i8 _1) { return __lasx_xvsrli_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrli_h( // CHECK-SAME: ptr dead_on_unwind noalias 
writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrli_h(v16i16 _1) { return __lasx_xvsrli_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrli_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrli_w(v8i32 _1) { return __lasx_xvsrli_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrli_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsrli_d(v4i64 _1) { return __lasx_xvsrli_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrlr_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrlr_b(v32i8 _1, v32i8 _2) { return __lasx_xvsrlr_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrlr_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrlr_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrlr_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrlr_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrlr_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrlr_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrlr_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 
xvsrlr_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrlr_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrlri_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrlri_b(v32i8 _1) { return __lasx_xvsrlri_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrlri_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrlri_h(v16i16 _1) { return __lasx_xvsrlri_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrlri_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrlri_w(v8i32 _1) { return __lasx_xvsrlri_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvsrlri_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsrlri_d(v4i64 _1) 
{ return __lasx_xvsrlri_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitclr_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvbitclr_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitclr_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitclr_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvbitclr_h(v16u16 _1, v16u16 _2) { return __lasx_xvbitclr_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitclr_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvbitclr_w(v8u32 _1, v8u32 _2) { return __lasx_xvbitclr_w(_1, _2); } // CHECK-LABEL: 
define dso_local void @xvbitclr_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvbitclr_d(v4u64 _1, v4u64 _2) { return __lasx_xvbitclr_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvbitclri_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvbitclri_b(v32u8 _1) { return __lasx_xvbitclri_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitclri_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvbitclri_h(v16u16 _1) { return __lasx_xvbitclri_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitclri_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvbitclri_w(v8u32 _1) { return __lasx_xvbitclri_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitclri_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvbitclri_d(v4u64 _1) { return __lasx_xvbitclri_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitset_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvbitset_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitset_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvbitset_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvbitset_h(v16u16 _1, v16u16 _2) { return __lasx_xvbitset_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvbitset_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvbitset_w(v8u32 _1, v8u32 _2) { return __lasx_xvbitset_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvbitset_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvbitset_d(v4u64 _1, v4u64 _2) { return __lasx_xvbitset_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvbitseti_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvbitseti_b(v32u8 _1) { return __lasx_xvbitseti_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitseti_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvbitseti_h(v16u16 _1) { return __lasx_xvbitseti_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitseti_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvbitseti_w(v8u32 _1) { return __lasx_xvbitseti_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitseti_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvbitseti_d(v4u64 _1) { return __lasx_xvbitseti_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitrev_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvbitrev_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitrev_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvbitrev_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvbitrev_h(v16u16 _1, v16u16 _2) { return __lasx_xvbitrev_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvbitrev_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvbitrev_w(v8u32 _1, v8u32 _2) { return __lasx_xvbitrev_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvbitrev_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvbitrev_d(v4u64 _1, v4u64 _2) { return __lasx_xvbitrev_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvbitrevi_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvbitrevi_b(v32u8 _1) { return __lasx_xvbitrevi_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitrevi_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvbitrevi_h(v16u16 _1) { return __lasx_xvbitrevi_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitrevi_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvbitrevi_w(v8u32 _1) { return __lasx_xvbitrevi_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitrevi_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvbitrevi_d(v4u64 _1) { return __lasx_xvbitrevi_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvadd_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvadd_b(v32i8 _1, v32i8 _2) { return __lasx_xvadd_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvadd_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvadd_h(v16i16 _1, v16i16 _2) { return __lasx_xvadd_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvadd_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvadd_w(v8i32 _1, v8i32 _2) { return __lasx_xvadd_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvadd_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvadd_d(v4i64 _1, v4i64 _2) { return __lasx_xvadd_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddi_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvaddi_bu(v32i8 _1) { return __lasx_xvaddi_bu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvaddi_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvaddi_hu(v16i16 _1) { return __lasx_xvaddi_hu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvaddi_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvaddi_wu(v8i32 _1) { return __lasx_xvaddi_wu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvaddi_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvaddi_du(v4i64 _1) { return __lasx_xvaddi_du(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsub_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsub_b(v32i8 _1, v32i8 _2) { return __lasx_xvsub_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsub_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsub_h(v16i16 _1, v16i16 _2) { return __lasx_xvsub_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsub_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsub_w(v8i32 _1, v8i32 _2) { return __lasx_xvsub_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsub_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsub_d(v4i64 _1, v4i64 _2) { return __lasx_xvsub_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsubi_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsubi_bu(v32i8 _1) { return __lasx_xvsubi_bu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsubi_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsubi_hu(v16i16 _1) { return __lasx_xvsubi_hu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsubi_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsubi_wu(v8i32 _1) { return __lasx_xvsubi_wu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsubi_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsubi_du(v4i64 _1) { return __lasx_xvsubi_du(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmax_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvmax_b(v32i8 _1, v32i8 _2) { return __lasx_xvmax_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmax_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmax_h(v16i16 _1, v16i16 _2) { return __lasx_xvmax_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmax_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmax_w(v8i32 _1, v8i32 _2) { return __lasx_xvmax_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmax_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmax_d(v4i64 _1, v4i64 _2) { return __lasx_xvmax_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmaxi_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvmaxi_b(v32i8 _1) { return __lasx_xvmaxi_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmaxi_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmaxi_h(v16i16 _1) { return __lasx_xvmaxi_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmaxi_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmaxi_w(v8i32 _1) { return __lasx_xvmaxi_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmaxi_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaxi_d(v4i64 _1) { return __lasx_xvmaxi_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmax_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvmax_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmax_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmax_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvmax_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmax_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmax_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvmax_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmax_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmax_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmax_du(v4u64 _1, v4u64 _2) { return __lasx_xvmax_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmaxi_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvmaxi_bu(v32u8 _1) { return __lasx_xvmaxi_bu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmaxi_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvmaxi_hu(v16u16 _1) { return __lasx_xvmaxi_hu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmaxi_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvmaxi_wu(v8u32 _1) { return __lasx_xvmaxi_wu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmaxi_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmaxi_du(v4u64 _1) { return __lasx_xvmaxi_du(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmin_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvmin_b(v32i8 _1, v32i8 _2) { return __lasx_xvmin_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmin_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmin_h(v16i16 _1, v16i16 _2) { return __lasx_xvmin_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmin_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmin_w(v8i32 _1, v8i32 _2) { return __lasx_xvmin_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmin_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmin_d(v4i64 _1, v4i64 _2) { return __lasx_xvmin_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmini_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvmini_b(v32i8 _1) { return __lasx_xvmini_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmini_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmini_h(v16i16 _1) { return __lasx_xvmini_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmini_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmini_w(v8i32 _1) { return __lasx_xvmini_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmini_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmini_d(v4i64 _1) { return __lasx_xvmini_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmin_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvmin_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmin_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmin_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvmin_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmin_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmin_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvmin_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmin_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmin_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmin_du(v4u64 _1, v4u64 _2) { return __lasx_xvmin_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmini_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvmini_bu(v32u8 _1) { return __lasx_xvmini_bu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmini_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvmini_hu(v16u16 _1) { return __lasx_xvmini_hu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmini_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvmini_wu(v8u32 _1) { return __lasx_xvmini_wu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmini_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmini_du(v4u64 _1) { return __lasx_xvmini_du(_1, 1); }
// CHECK-LABEL: define dso_local void @xvseq_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvseq_b(v32i8 _1, v32i8 _2) { return __lasx_xvseq_b(_1, _2); }
// CHECK-LABEL:
define dso_local void @xvseq_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvseq_h(v16i16 _1, v16i16 _2) { return __lasx_xvseq_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvseq_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvseq_w(v8i32 _1, v8i32 _2) { return __lasx_xvseq_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvseq_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvseq_d(v4i64 _1, v4i64 _2) { return __lasx_xvseq_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvseqi_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 
x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvseqi_b(v32i8 _1) { return __lasx_xvseqi_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvseqi_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvseqi_h(v16i16 _1) { return __lasx_xvseqi_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvseqi_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvseqi_w(v8i32 _1) { return __lasx_xvseqi_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvseqi_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvseqi_d(v4i64 _1) { return __lasx_xvseqi_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvslt_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 
32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvslt_b(v32i8 _1, v32i8 _2) { return __lasx_xvslt_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvslt_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvslt_h(v16i16 _1, v16i16 _2) { return __lasx_xvslt_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvslt_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvslt_w(v8i32 _1, v8i32 _2) { return __lasx_xvslt_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvslt_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], 
ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvslt_d(v4i64 _1, v4i64 _2) { return __lasx_xvslt_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvslti_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvslti_b(v32i8 _1) { return __lasx_xvslti_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvslti_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvslti_h(v16i16 _1) { return __lasx_xvslti_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvslti_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvslti_w(v8i32 _1) { return __lasx_xvslti_w(_1, 1); } // CHECK-LABEL: define dso_local void 
@xvslti_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvslti_d(v4i64 _1) { return __lasx_xvslti_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvslt_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvslt_bu(v32u8 _1, v32u8 _2) { return __lasx_xvslt_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvslt_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvslt_hu(v16u16 _1, v16u16 _2) { return __lasx_xvslt_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvslt_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// 
CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvslt_wu(v8u32 _1, v8u32 _2) { return __lasx_xvslt_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvslt_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvslt_du(v4u64 _1, v4u64 _2) { return __lasx_xvslt_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvslti_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvslti_bu(v32u8 _1) { return __lasx_xvslti_bu(_1, 1); } // CHECK-LABEL: define dso_local void @xvslti_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], 
align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvslti_hu(v16u16 _1) { return __lasx_xvslti_hu(_1, 1); } // CHECK-LABEL: define dso_local void @xvslti_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvslti_wu(v8u32 _1) { return __lasx_xvslti_wu(_1, 1); } // CHECK-LABEL: define dso_local void @xvslti_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvslti_du(v4u64 _1) { return __lasx_xvslti_du(_1, 1); } // CHECK-LABEL: define dso_local void @xvsle_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsle_b(v32i8 _1, v32i8 _2) { return __lasx_xvsle_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvsle_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, 
ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsle_h(v16i16 _1, v16i16 _2) { return __lasx_xvsle_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsle_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsle_w(v8i32 _1, v8i32 _2) { return __lasx_xvsle_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsle_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsle_d(v4i64 _1, v4i64 _2) { return __lasx_xvsle_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvslei_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store 
<32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvslei_b(v32i8 _1) { return __lasx_xvslei_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvslei_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvslei_h(v16i16 _1) { return __lasx_xvslei_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvslei_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvslei_w(v8i32 _1) { return __lasx_xvslei_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvslei_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvslei_d(v4i64 _1) { return __lasx_xvslei_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvsle_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// 
CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsle_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsle_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvsle_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsle_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsle_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvsle_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsle_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsle_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvsle_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] 
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsle_du(v4u64 _1, v4u64 _2) { return __lasx_xvsle_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvslei_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvslei_bu(v32u8 _1) { return __lasx_xvslei_bu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvslei_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvslei_hu(v16u16 _1) { return __lasx_xvslei_hu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvslei_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvslei_wu(v8u32 _1) { return __lasx_xvslei_wu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvslei_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvslei_du(v4u64 _1) { return __lasx_xvslei_du(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsat_b(v32i8 _1) { return __lasx_xvsat_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsat_h(v16i16 _1) { return __lasx_xvsat_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsat_w(v8i32 _1) { return __lasx_xvsat_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsat_d(v4i64 _1) { return __lasx_xvsat_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvsat_bu(v32u8 _1) { return __lasx_xvsat_bu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvsat_hu(v16u16 _1) { return __lasx_xvsat_hu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvsat_wu(v8u32 _1) { return __lasx_xvsat_wu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvsat_du(v4u64 _1) { return __lasx_xvsat_du(_1, 1); }
// CHECK-LABEL: define dso_local void @xvadda_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvadda_b(v32i8 _1, v32i8 _2) { return __lasx_xvadda_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvadda_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvadda_h(v16i16 _1, v16i16 _2) { return __lasx_xvadda_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvadda_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvadda_w(v8i32 _1, v8i32 _2) { return __lasx_xvadda_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvadda_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvadda_d(v4i64 _1, v4i64 _2) { return __lasx_xvadda_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsadd_b(v32i8 _1, v32i8 _2) { return __lasx_xvsadd_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsadd_h(v16i16 _1, v16i16 _2) { return __lasx_xvsadd_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsadd_w(v8i32 _1, v8i32 _2) { return __lasx_xvsadd_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsadd_d(v4i64 _1, v4i64 _2) { return __lasx_xvsadd_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvsadd_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsadd_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvsadd_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsadd_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvsadd_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsadd_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvsadd_du(v4u64 _1, v4u64 _2) { return __lasx_xvsadd_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavg_b(
//
CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvavg_b(v32i8 _1, v32i8 _2) { return __lasx_xvavg_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvavg_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvavg_h(v16i16 _1, v16i16 _2) { return __lasx_xvavg_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvavg_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvavg_w(v8i32 _1, v8i32 _2) { return __lasx_xvavg_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvavg_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvavg_d(v4i64 _1, v4i64 _2) { return __lasx_xvavg_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvavg_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvavg_bu(v32u8 _1, v32u8 _2) { return __lasx_xvavg_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvavg_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvavg_hu(v16u16 _1, v16u16 _2) { return __lasx_xvavg_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvavg_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvavg_wu(v8u32 _1, v8u32 _2) { return __lasx_xvavg_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvavg_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvavg_du(v4u64 _1, v4u64 _2) { return __lasx_xvavg_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvavgr_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvavgr_b(v32i8 _1, v32i8 _2) { return __lasx_xvavgr_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvavgr_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) 
local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvavgr_h(v16i16 _1, v16i16 _2) { return __lasx_xvavgr_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvavgr_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvavgr_w(v8i32 _1, v8i32 _2) { return __lasx_xvavgr_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvavgr_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvavgr_d(v4i64 _1, v4i64 _2) { return __lasx_xvavgr_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvavgr_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = 
load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvavgr_bu(v32u8 _1, v32u8 _2) { return __lasx_xvavgr_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvavgr_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvavgr_hu(v16u16 _1, v16u16 _2) { return __lasx_xvavgr_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvavgr_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvavgr_wu(v8u32 _1, v8u32 _2) { return __lasx_xvavgr_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvavgr_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// 
CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvavgr_du(v4u64 _1, v4u64 _2) { return __lasx_xvavgr_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvssub_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvssub_b(v32i8 _1, v32i8 _2) { return __lasx_xvssub_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvssub_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvssub_h(v16i16 _1, v16i16 _2) { return __lasx_xvssub_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvssub_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa 
[[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvssub_w(v8i32 _1, v8i32 _2) { return __lasx_xvssub_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvssub_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvssub_d(v4i64 _1, v4i64 _2) { return __lasx_xvssub_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvssub_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvssub_bu(v32u8 _1, v32u8 _2) { return __lasx_xvssub_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvssub_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvssub_hu(v16u16 _1, v16u16 _2) { return __lasx_xvssub_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvssub_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvssub_wu(v8u32 _1, v8u32 _2) { return __lasx_xvssub_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvssub_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvssub_du(v4u64 _1, v4u64 _2) { return __lasx_xvssub_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvabsd_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 
32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvabsd_b(v32i8 _1, v32i8 _2) { return __lasx_xvabsd_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvabsd_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvabsd_h(v16i16 _1, v16i16 _2) { return __lasx_xvabsd_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvabsd_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvabsd_w(v8i32 _1, v8i32 _2) { return __lasx_xvabsd_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvabsd_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvabsd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvabsd_d(v4i64 _1, v4i64 _2) { return __lasx_xvabsd_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvabsd_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvabsd_bu(v32u8 _1, v32u8 _2) { return __lasx_xvabsd_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvabsd_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvabsd_hu(v16u16 _1, v16u16 _2) { return __lasx_xvabsd_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvabsd_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// 
CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvabsd_wu(v8u32 _1, v8u32 _2) { return __lasx_xvabsd_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvabsd_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvabsd_du(v4u64 _1, v4u64 _2) { return __lasx_xvabsd_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvmul_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmul_b(v32i8 _1, v32i8 _2) { return __lasx_xvmul_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvmul_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmul_h(v16i16 _1, v16i16 _2) { return __lasx_xvmul_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvmul_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmul_w(v8i32 _1, v8i32 _2) { return __lasx_xvmul_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvmul_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmul_d(v4i64 _1, v4i64 _2) { return __lasx_xvmul_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvmadd_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> 
@llvm.loongarch.lasx.xvmadd.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmadd_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmadd_b(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvmadd_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmadd_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmadd_h(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvmadd_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32> [[_136]], <8 x i32> [[_247]], <8 x i32> [[_358]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmadd_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmadd_w(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvmadd_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmadd_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmadd_d(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvmsub_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmsub_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmsub_b(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvmsub_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr 
[[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmsub_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmsub_h(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmsub_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> [[_136]], <8 x i32> [[_247]], <8 x i32> [[_358]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmsub_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmsub_w(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmsub_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmsub_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmsub_d(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvdiv_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvdiv_b(v32i8 _1, v32i8 _2) { return __lasx_xvdiv_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvdiv_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvdiv_h(v16i16 _1, v16i16 _2) { return __lasx_xvdiv_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvdiv_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvdiv_w(v8i32 _1, v8i32 _2) { return __lasx_xvdiv_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvdiv_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvdiv_d(v4i64 _1, v4i64 _2) { return __lasx_xvdiv_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvdiv_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvdiv_bu(v32u8 _1, v32u8 _2) { return __lasx_xvdiv_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvdiv_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvdiv_hu(v16u16 _1, v16u16 _2) { return __lasx_xvdiv_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvdiv_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvdiv_wu(v8u32 _1, v8u32 _2) { return __lasx_xvdiv_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvdiv_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvdiv_du(v4u64 _1, v4u64 _2) { return __lasx_xvdiv_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhaddw_h_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvhaddw_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvhaddw_h_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhaddw_w_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvhaddw_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvhaddw_w_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhaddw_d_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvhaddw_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvhaddw_d_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhaddw_hu_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvhaddw_hu_bu(v32u8 _1, v32u8 _2) { return __lasx_xvhaddw_hu_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhaddw_wu_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvhaddw_wu_hu(v16u16 _1, v16u16 _2) { return __lasx_xvhaddw_wu_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhaddw_du_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvhaddw_du_wu(v8u32 _1, v8u32 _2) { return __lasx_xvhaddw_du_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhsubw_h_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvhsubw_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvhsubw_h_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhsubw_w_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvhsubw_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvhsubw_w_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhsubw_d_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvhsubw_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvhsubw_d_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhsubw_hu_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvhsubw_hu_bu(v32u8 _1, v32u8 _2) { return __lasx_xvhsubw_hu_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhsubw_wu_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvhsubw_wu_hu(v16u16 _1, v16u16 _2) { return __lasx_xvhsubw_wu_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhsubw_du_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvhsubw_du_wu(v8u32 _1, v8u32 _2) { return __lasx_xvhsubw_du_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmod_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvmod_b(v32i8 _1, v32i8 _2) { return __lasx_xvmod_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmod_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmod_h(v16i16 _1, v16i16 _2) { return __lasx_xvmod_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmod_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmod_w(v8i32 _1, v8i32 _2) { return __lasx_xvmod_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmod_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmod_d(v4i64 _1, v4i64 _2) { return __lasx_xvmod_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmod_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvmod_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmod_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmod_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvmod_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmod_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmod_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvmod_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmod_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmod_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmod_du(v4u64 _1, v4u64 _2) { return __lasx_xvmod_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvrepl128vei_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvrepl128vei_b(v32i8 _1) { return __lasx_xvrepl128vei_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvrepl128vei_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvrepl128vei_h(v16i16 _1) { return __lasx_xvrepl128vei_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvrepl128vei_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvrepl128vei_w(v8i32 _1) { return __lasx_xvrepl128vei_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvrepl128vei_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvrepl128vei_d(v4i64 _1) { return __lasx_xvrepl128vei_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvpickev_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvpickev_b(v32i8 _1, v32i8 _2) { return __lasx_xvpickev_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpickev_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvpickev_h(v16i16 _1, v16i16 _2) { return __lasx_xvpickev_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpickev_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvpickev_w(v8i32 _1, v8i32 _2) { return __lasx_xvpickev_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpickev_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvpickev_d(v4i64 _1, v4i64 _2) { return __lasx_xvpickev_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpickod_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvpickod_b(v32i8 _1, v32i8 _2) { return __lasx_xvpickod_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpickod_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvpickod_h(v16i16 _1, v16i16 _2) { return __lasx_xvpickod_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpickod_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvpickod_w(v8i32 _1, v8i32 _2) { return __lasx_xvpickod_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpickod_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvpickod_d(v4i64 _1, v4i64 _2) { return __lasx_xvpickod_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvh_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvilvh_b(v32i8 _1, v32i8 _2) { return __lasx_xvilvh_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvh_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvilvh_h(v16i16 _1, v16i16 _2) { return __lasx_xvilvh_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvh_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvilvh_w(v8i32 _1, v8i32 _2) { return __lasx_xvilvh_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvh_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvilvh_d(v4i64 _1, v4i64 _2) { return __lasx_xvilvh_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvl_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvilvl_b(v32i8 _1, v32i8 _2) { return __lasx_xvilvl_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvl_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvilvl_h(v16i16 _1, v16i16 _2) { return __lasx_xvilvl_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvl_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvilvl_w(v8i32 _1, v8i32 _2) { return __lasx_xvilvl_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvl_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvilvl_d(v4i64 _1, v4i64 _2) { return __lasx_xvilvl_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackev_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvpackev_b(v32i8 _1, v32i8 _2) { return __lasx_xvpackev_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackev_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvpackev_h(v16i16 _1, v16i16 _2) { return __lasx_xvpackev_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackev_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvpackev_w(v8i32 _1, v8i32 _2) { return __lasx_xvpackev_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackev_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvpackev_d(v4i64 _1, v4i64 _2) { return __lasx_xvpackev_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackod_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvpackod_b(v32i8 _1, v32i8 _2) { return __lasx_xvpackod_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackod_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvpackod_h(v16i16 _1, v16i16 _2) { return __lasx_xvpackod_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackod_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvpackod_w(v8i32 _1, v8i32 _2) { return __lasx_xvpackod_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackod_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvpackod_d(v4i64 _1, v4i64 _2) { return __lasx_xvpackod_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvshuf_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
-// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvshuf_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvshuf_b(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvshuf_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvshuf_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvshuf_h(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvshuf_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> [[_136]], <8 x i32> [[_247]], <8 x i32> [[_358]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvshuf_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __lasx_xvshuf_w(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvshuf_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvshuf_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvshuf_d(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvand_v(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvand_v(v32u8 _1, v32u8 _2) { return __lasx_xvand_v(_1, _2); }
// CHECK-LABEL: define dso_local void @xvandi_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr
[[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvandi_b(v32u8 _1) { return __lasx_xvandi_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvor_v( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvor_v(v32u8 _1, v32u8 _2) { return __lasx_xvor_v(_1, _2); } // CHECK-LABEL: define dso_local void @xvori_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvori_b(v32u8 _1) { return __lasx_xvori_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvnor_v( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 
32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvnor_v(v32u8 _1, v32u8 _2) { return __lasx_xvnor_v(_1, _2); } // CHECK-LABEL: define dso_local void @xvnori_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvnori_b(v32u8 _1) { return __lasx_xvnori_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvxor_v( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvxor_v(v32u8 _1, v32u8 _2) { return __lasx_xvxor_v(_1, _2); } // CHECK-LABEL: define dso_local void @xvxori_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvxori_b(v32u8 _1) { return __lasx_xvxori_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitsel_v( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvbitsel_v(v32u8 _1, v32u8 _2, v32u8 _3) { return __lasx_xvbitsel_v(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvbitseli_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvbitseli_b(v32u8 _1, v32u8 _2) { return __lasx_xvbitseli_b(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvshuf4i_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvshuf4i_b(v32i8 _1) { return __lasx_xvshuf4i_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvshuf4i_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 
x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvshuf4i_h(v16i16 _1) { return __lasx_xvshuf4i_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvshuf4i_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvshuf4i_w(v8i32 _1) { return __lasx_xvshuf4i_w(_1, 1); } @@ -2815,7 +2815,7 @@ v8i32 xvshuf4i_w(v8i32 _1) { return __lasx_xvshuf4i_w(_1, 1); } // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32 [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvreplgr2vr_b(int _1) { return __lasx_xvreplgr2vr_b(_1); } @@ -2823,7 +2823,7 @@ v32i8 xvreplgr2vr_b(int _1) { return __lasx_xvreplgr2vr_b(_1); } // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplgr2vr.h(i32 [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvreplgr2vr_h(int _1) { return __lasx_xvreplgr2vr_h(_1); } @@ -2831,7 +2831,7 @@ v16i16 xvreplgr2vr_h(int _1) { return __lasx_xvreplgr2vr_h(_1); } // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvreplgr2vr_w(int _1) { return __lasx_xvreplgr2vr_w(_1); } @@ -2840,1641 +2840,1641 @@ v8i32 xvreplgr2vr_w(int _1) { return __lasx_xvreplgr2vr_w(_1); } // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[_1]] to i64 // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 
[[CONV]]) -// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvreplgr2vr_d(int _1) { return __lasx_xvreplgr2vr_d(_1); } // CHECK-LABEL: define dso_local void @xvpcnt_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvpcnt_b(v32i8 _1) { return __lasx_xvpcnt_b(_1); } // CHECK-LABEL: define dso_local void @xvpcnt_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvpcnt_h(v16i16 _1) { return __lasx_xvpcnt_h(_1); } // CHECK-LABEL: define dso_local void @xvpcnt_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvpcnt_w(v8i32 _1) { return __lasx_xvpcnt_w(_1); } // CHECK-LABEL: define dso_local void @xvpcnt_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvpcnt_d(v4i64 _1) { return __lasx_xvpcnt_d(_1); } // CHECK-LABEL: define dso_local void @xvclo_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvclo_b(v32i8 _1) { return __lasx_xvclo_b(_1); } // CHECK-LABEL: define dso_local void @xvclo_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvclo_h(v16i16 _1) { return __lasx_xvclo_h(_1); } // CHECK-LABEL: define dso_local void @xvclo_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvclo_w(v8i32 _1) { return __lasx_xvclo_w(_1); } // CHECK-LABEL: define dso_local void @xvclo_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> 
[[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvclo_d(v4i64 _1) { return __lasx_xvclo_d(_1); } // CHECK-LABEL: define dso_local void @xvclz_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvclz_b(v32i8 _1) { return __lasx_xvclz_b(_1); } // CHECK-LABEL: define dso_local void @xvclz_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvclz_h(v16i16 _1) { return __lasx_xvclz_h(_1); } // CHECK-LABEL: define dso_local void @xvclz_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvclz_w(v8i32 _1) { return __lasx_xvclz_w(_1); } // CHECK-LABEL: define dso_local void @xvclz_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret 
void // v4i64 xvclz_d(v4i64 _1) { return __lasx_xvclz_d(_1); } // CHECK-LABEL: define dso_local void @xvfadd_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfadd_s(v8f32 _1, v8f32 _2) { return __lasx_xvfadd_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvfadd_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfadd_d(v4f64 _1, v4f64 _2) { return __lasx_xvfadd_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfsub_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfsub_s(v8f32 _1, v8f32 _2) { return __lasx_xvfsub_s(_1, 
_2); } // CHECK-LABEL: define dso_local void @xvfsub_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfsub_d(v4f64 _1, v4f64 _2) { return __lasx_xvfsub_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmul_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfmul_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmul_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmul_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfmul_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmul_d(_1, _2); } // CHECK-LABEL: define dso_local void 
@xvfdiv_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfdiv_s(v8f32 _1, v8f32 _2) { return __lasx_xvfdiv_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvfdiv_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfdiv_d(v4f64 _1, v4f64 _2) { return __lasx_xvfdiv_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfcvt_h_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvfcvt_h_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcvt_h_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvfcvt_s_d( // CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfcvt_s_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcvt_s_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmin_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfmin_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmin_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmin_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfmin_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmin_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmina_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) 
align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfmina_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmina_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmina_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfmina_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmina_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmax_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfmax_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmax_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmax_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfmax_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmax_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmaxa_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfmaxa_s(v8f32 _1, v8f32 _2) { return __lasx_xvfmaxa_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmaxa_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfmaxa_d(v4f64 _1, v4f64 _2) { return __lasx_xvfmaxa_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfclass_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvfclass_s(v8f32 _1) { return __lasx_xvfclass_s(_1); } // CHECK-LABEL: define dso_local void @xvfclass_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvfclass_d(v4f64 _1) { return __lasx_xvfclass_d(_1); } // CHECK-LABEL: define dso_local void @xvfsqrt_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsqrt.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfsqrt_s(v8f32 _1) { return __lasx_xvfsqrt_s(_1); } // CHECK-LABEL: define dso_local void @xvfsqrt_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfsqrt_d(v4f64 _1) { return __lasx_xvfsqrt_d(_1); } // CHECK-LABEL: define dso_local void @xvfrecip_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly 
captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvfrecip_s(v8f32 _1) { return __lasx_xvfrecip_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrecip_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvfrecip_d(v4f64 _1) { return __lasx_xvfrecip_d(_1); }
 // CHECK-LABEL: define dso_local void @xvfrint_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvfrint_s(v8f32 _1) { return __lasx_xvfrint_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrint_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvfrint_d(v4f64 _1) { return __lasx_xvfrint_d(_1); }
 // CHECK-LABEL: define dso_local void @xvfrsqrt_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvfrsqrt_s(v8f32 _1) { return __lasx_xvfrsqrt_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrsqrt_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvfrsqrt_d(v4f64 _1) { return __lasx_xvfrsqrt_d(_1); }
 // CHECK-LABEL: define dso_local void @xvflogb_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvflogb_s(v8f32 _1) { return __lasx_xvflogb_s(_1); }
 // CHECK-LABEL: define dso_local void @xvflogb_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvflogb_d(v4f64 _1) { return __lasx_xvflogb_d(_1); }
 // CHECK-LABEL: define dso_local void @xvfcvth_s_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> [[_112]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvfcvth_s_h(v16i16 _1) { return __lasx_xvfcvth_s_h(_1); }
 // CHECK-LABEL: define dso_local void @xvfcvth_d_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvfcvth_d_s(v8f32 _1) { return __lasx_xvfcvth_d_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfcvtl_s_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> [[_112]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvfcvtl_s_h(v16i16 _1) { return __lasx_xvfcvtl_s_h(_1); }
 // CHECK-LABEL: define dso_local void @xvfcvtl_d_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvfcvtl_d_s(v8f32 _1) { return __lasx_xvfcvtl_d_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftint_w_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftint_w_s(v8f32 _1) { return __lasx_xvftint_w_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftint_l_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftint_l_d(v4f64 _1) { return __lasx_xvftint_l_d(_1); }
 // CHECK-LABEL: define dso_local void @xvftint_wu_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8u32 xvftint_wu_s(v8f32 _1) { return __lasx_xvftint_wu_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftint_lu_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4u64 xvftint_lu_d(v4f64 _1) { return __lasx_xvftint_lu_d(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrz_w_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrz_w_s(v8f32 _1) { return __lasx_xvftintrz_w_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrz_l_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrz_l_d(v4f64 _1) { return __lasx_xvftintrz_l_d(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrz_wu_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8u32 xvftintrz_wu_s(v8f32 _1) { return __lasx_xvftintrz_wu_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrz_lu_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4u64 xvftintrz_lu_d(v4f64 _1) { return __lasx_xvftintrz_lu_d(_1); }
 // CHECK-LABEL: define dso_local void @xvffint_s_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> [[_112]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvffint_s_w(v8i32 _1) { return __lasx_xvffint_s_w(_1); }
 // CHECK-LABEL: define dso_local void @xvffint_d_l(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvffint_d_l(v4i64 _1) { return __lasx_xvffint_d_l(_1); }
 // CHECK-LABEL: define dso_local void @xvffint_s_wu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> [[_112]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvffint_s_wu(v8u32 _1) { return __lasx_xvffint_s_wu(_1); }
 // CHECK-LABEL: define dso_local void @xvffint_d_lu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvffint_d_lu(v4u64 _1) { return __lasx_xvffint_d_lu(_1); }
 // CHECK-LABEL: define dso_local void @xvreplve_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_112]], i32 [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvreplve_b(v32i8 _1, int _2) { return __lasx_xvreplve_b(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvreplve_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_112]], i32 [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvreplve_h(v16i16 _1, int _2) { return __lasx_xvreplve_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvreplve_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_112]], i32 [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvreplve_w(v8i32 _1, int _2) { return __lasx_xvreplve_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvreplve_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1]], i32 [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvreplve_d(v4i64 _1, int _2) { return __lasx_xvreplve_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvpermi_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvpermi_w(v8i32 _1, v8i32 _2) { return __lasx_xvpermi_w(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvandn_v(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32u8 xvandn_v(v32u8 _1, v32u8 _2) { return __lasx_xvandn_v(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvneg_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> [[_112]])
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvneg_b(v32i8 _1) { return __lasx_xvneg_b(_1); }
 // CHECK-LABEL: define dso_local void @xvneg_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> [[_112]])
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvneg_h(v16i16 _1) { return __lasx_xvneg_h(_1); }
 // CHECK-LABEL: define dso_local void @xvneg_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> [[_112]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvneg_w(v8i32 _1) { return __lasx_xvneg_w(_1); }
 // CHECK-LABEL: define dso_local void @xvneg_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvneg_d(v4i64 _1) { return __lasx_xvneg_d(_1); }
 // CHECK-LABEL: define dso_local void @xvmuh_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvmuh_b(v32i8 _1, v32i8 _2) { return __lasx_xvmuh_b(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvmuh_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvmuh_h(v16i16 _1, v16i16 _2) { return __lasx_xvmuh_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvmuh_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvmuh_w(v8i32 _1, v8i32 _2) { return __lasx_xvmuh_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvmuh_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvmuh_d(v4i64 _1, v4i64 _2) { return __lasx_xvmuh_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvmuh_bu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32u8 xvmuh_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmuh_bu(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvmuh_hu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16u16 xvmuh_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmuh_hu(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvmuh_wu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8u32 xvmuh_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmuh_wu(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvmuh_du(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4u64 xvmuh_du(v4u64 _1, v4u64 _2) { return __lasx_xvmuh_du(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvsllwil_h_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvsllwil_h_b(v32i8 _1) { return __lasx_xvsllwil_h_b(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvsllwil_w_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvsllwil_w_h(v16i16 _1) { return __lasx_xvsllwil_w_h(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvsllwil_d_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvsllwil_d_w(v8i32 _1) { return __lasx_xvsllwil_d_w(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvsllwil_hu_bu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16u16 xvsllwil_hu_bu(v32u8 _1) { return __lasx_xvsllwil_hu_bu(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvsllwil_wu_hu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8u32 xvsllwil_wu_hu(v16u16 _1) { return __lasx_xvsllwil_wu_hu(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvsllwil_du_wu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4u64 xvsllwil_du_wu(v8u32 _1) { return __lasx_xvsllwil_du_wu(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvsran_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvsran_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsran_b_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvsran_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvsran_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsran_h_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvsran_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvsran_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsran_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssran_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvssran_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssran_b_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssran_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvssran_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssran_h_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssran_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvssran_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssran_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssran_bu_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32u8 xvssran_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssran_bu_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssran_hu_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16u16 xvssran_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssran_hu_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssran_wu_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8u32 xvssran_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssran_wu_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvsrarn_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvsrarn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrarn_b_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvsrarn_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvsrarn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrarn_h_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvsrarn_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvsrarn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrarn_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrarn_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvssrarn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssrarn_b_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrarn_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvssrarn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssrarn_h_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrarn_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvssrarn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssrarn_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrarn_bu_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32u8 xvssrarn_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssrarn_bu_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrarn_hu_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16u16 xvssrarn_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssrarn_hu_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrarn_wu_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8u32 xvssrarn_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssrarn_wu_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvsrln_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvsrln_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrln_b_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvsrln_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvsrln_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrln_h_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvsrln_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvsrln_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrln_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrln_bu_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32u8 xvssrln_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssrln_bu_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrln_hu_w(
//
CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvssrln_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssrln_hu_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrln_wu_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvssrln_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssrln_wu_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrlrn_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrlrn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvsrlrn_b_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrlrn_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrlrn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvsrlrn_h_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrlrn_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrlrn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvsrlrn_w_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrlrn_bu_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvssrlrn_bu_h(v16u16 _1, v16u16 _2) { return __lasx_xvssrlrn_bu_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrlrn_hu_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvssrlrn_hu_w(v8u32 _1, v8u32 _2) { return __lasx_xvssrlrn_hu_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrlrn_wu_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvssrlrn_wu_d(v4u64 _1, v4u64 _2) { return __lasx_xvssrlrn_wu_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfrstpi_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvfrstpi_b(v32i8 _1, v32i8 _2) { return __lasx_xvfrstpi_b(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvfrstpi_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], 
ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvfrstpi_h(v16i16 _1, v16i16 _2) { return __lasx_xvfrstpi_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvfrstp_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_136:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvfrstp_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __lasx_xvfrstp_b(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvfrstp_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> [[_136]], <16 x i16> [[_247]], <16 
x i16> [[_358]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvfrstp_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __lasx_xvfrstp_h(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvshuf4i_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvshuf4i_d(v4i64 _1, v4i64 _2) { return __lasx_xvshuf4i_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvbsrl_v( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvbsrl_v(v32i8 _1) { return __lasx_xvbsrl_v(_1, 1); } // CHECK-LABEL: define dso_local void @xvbsll_v( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvbsll_v(v32i8 _1) { return __lasx_xvbsll_v(_1, 1); } // CHECK-LABEL: define dso_local void @xvextrins_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvextrins_b(v32i8 _1, v32i8 _2) { return __lasx_xvextrins_b(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvextrins_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvextrins_h(v16i16 _1, v16i16 _2) { return __lasx_xvextrins_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvextrins_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvextrins_w(v8i32 _1, v8i32 _2) { return __lasx_xvextrins_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvextrins_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // 
CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvextrins_d(v4i64 _1, v4i64 _2) { return __lasx_xvextrins_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvmskltz_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> [[_112]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmskltz_b(v32i8 _1) { return __lasx_xvmskltz_b(_1); } // CHECK-LABEL: define dso_local void @xvmskltz_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmskltz_h(v16i16 _1) { return __lasx_xvmskltz_h(_1); } // CHECK-LABEL: define dso_local void @xvmskltz_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmskltz_w(v8i32 _1) { return __lasx_xvmskltz_w(_1); } // CHECK-LABEL: define dso_local void @xvmskltz_d( // CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmskltz_d(v4i64 _1) { return __lasx_xvmskltz_d(_1); } // CHECK-LABEL: define dso_local void @xvsigncov_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsigncov_b(v32i8 _1, v32i8 _2) { return __lasx_xvsigncov_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvsigncov_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsigncov_h(v16i16 _1, v16i16 _2) { return __lasx_xvsigncov_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsigncov_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr 
[[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsigncov_w(v8i32 _1, v8i32 _2) { return __lasx_xvsigncov_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsigncov_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsigncov_d(v4i64 _1, v4i64 _2) { return __lasx_xvsigncov_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmadd_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) -// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfmadd_s(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvfmadd_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef 
readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) -// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfmadd_d(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvfmsub_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) -// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfmsub_s(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvfmsub_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x 
double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) -// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfmsub_d(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvfnmadd_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) -// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfnmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfnmadd_s(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvfnmadd_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) -// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfnmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfnmadd_d(_1, _2, _3); } // CHECK-LABEL: define dso_local 
void @xvfnmsub_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]]) -// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfnmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __lasx_xvfnmsub_s(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvfnmsub_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]]) -// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfnmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __lasx_xvfnmsub_d(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvftintrne_w_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvftintrne_w_s(v8f32 _1) { return __lasx_xvftintrne_w_s(_1); } // CHECK-LABEL: define dso_local void @xvftintrne_l_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvftintrne_l_d(v4f64 _1) { return __lasx_xvftintrne_l_d(_1); } // CHECK-LABEL: define dso_local void @xvftintrp_w_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvftintrp_w_s(v8f32 _1) { return __lasx_xvftintrp_w_s(_1); } // CHECK-LABEL: define dso_local void @xvftintrp_l_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvftintrp_l_d(v4f64 _1) { return __lasx_xvftintrp_l_d(_1); } // CHECK-LABEL: define dso_local void @xvftintrm_w_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 
x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrm_w_s(v8f32 _1) { return __lasx_xvftintrm_w_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrm_l_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrm_l_d(v4f64 _1) { return __lasx_xvftintrm_l_d(_1); }
 // CHECK-LABEL: define dso_local void @xvftint_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftint_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftint_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvffint_s_l(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvffint_s_l(v4i64 _1, v4i64 _2) { return __lasx_xvffint_s_l(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvftintrz_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrz_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrz_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvftintrp_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrp_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrp_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvftintrm_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrm_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrm_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvftintrne_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrne_w_d(v4f64 _1, v4f64 _2) { return __lasx_xvftintrne_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvftinth_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftinth_l_s(v8f32 _1) { return __lasx_xvftinth_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintl_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintl_l_s(v8f32 _1) { return __lasx_xvftintl_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvffinth_d_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> [[_112]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvffinth_d_w(v8i32 _1) { return __lasx_xvffinth_d_w(_1); }
 // CHECK-LABEL: define dso_local void @xvffintl_d_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> [[_112]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvffintl_d_w(v8i32 _1) { return __lasx_xvffintl_d_w(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrzh_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrzh_l_s(v8f32 _1) { return __lasx_xvftintrzh_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrzl_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrzl_l_s(v8f32 _1) { return __lasx_xvftintrzl_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrph_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrph_l_s(v8f32 _1) { return __lasx_xvftintrph_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrpl_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrpl_l_s(v8f32 _1) { return __lasx_xvftintrpl_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrmh_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrmh_l_s(v8f32 _1) { return __lasx_xvftintrmh_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrml_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrml_l_s(v8f32 _1) { return __lasx_xvftintrml_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrneh_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrneh_l_s(v8f32 _1) { return __lasx_xvftintrneh_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrnel_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrnel_l_s(v8f32 _1) { return __lasx_xvftintrnel_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrne_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfrintrne_s(v8f32 _1) { return __lasx_xvfrintrne_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrne_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfrintrne_d(v4f64 _1) { return __lasx_xvfrintrne_d(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrz_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfrintrz_s(v8f32 _1) { return __lasx_xvfrintrz_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrz_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfrintrz_d(v4f64 _1) { return __lasx_xvfrintrz_d(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrp_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfrintrp_s(v8f32 _1) { return __lasx_xvfrintrp_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrp_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfrintrp_d(v4f64 _1) { return __lasx_xvfrintrp_d(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrm_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfrintrm_s(v8f32 _1) { return __lasx_xvfrintrm_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrm_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfrintrm_d(v4f64 _1) { return __lasx_xvfrintrm_d(_1); }
@@ -4482,14 +4482,14 @@ v4i64 xvfrintrm_d(v4f64 _1) { return __lasx_xvfrintrm_d(_1); }
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvld(ptr [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvld(void * _1) { return __lasx_xvld(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvst(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1]], ptr [[_2]], i32 1)
 // CHECK-NEXT: ret void
 //
@@ -4497,7 +4497,7 @@ void xvst(v32i8 _1, void * _2) { return __lasx_xvst(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvstelm_b(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1]], ptr [[_2]], i32 1, i32 1)
 // CHECK-NEXT: ret void
 //
@@ -4505,7 +4505,7 @@ void xvstelm_b(v32i8 _1, void * _2) { return __lasx_xvstelm_b(_1, _2, 1, 1); }
 // CHECK-LABEL: define dso_local void @xvstelm_h(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1]], ptr [[_2]], i32 2, i32 1)
 // CHECK-NEXT: ret void
 //
@@ -4513,7 +4513,7 @@ void xvstelm_h(v16i16 _1, void * _2) { return __lasx_xvstelm_h(_1, _2, 2, 1); }
 // CHECK-LABEL: define dso_local void @xvstelm_w(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1]], ptr [[_2]], i32 4, i32 1)
 // CHECK-NEXT: ret void
 //
@@ -4521,7 +4521,7 @@ void xvstelm_w(v8i32 _1, void * _2) { return __lasx_xvstelm_w(_1, _2, 4, 1); }
 // CHECK-LABEL: define dso_local void @xvstelm_d(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1]], ptr [[_2]], i32 8, i32 1)
 // CHECK-NEXT: ret void
 //
@@ -4529,108 +4529,108 @@ void xvstelm_d(v4i64 _1, void * _2) { return __lasx_xvstelm_d(_1, _2, 8, 1); }
 // CHECK-LABEL: define dso_local void @xvinsve0_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvinsve0_w(v8i32 _1, v8i32 _2) { return __lasx_xvinsve0_w(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvinsve0_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvinsve0_d(v4i64 _1, v4i64 _2) { return __lasx_xvinsve0_d(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvpickve_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvpickve_w(v8i32 _1) { return __lasx_xvpickve_w(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvpickve_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvpickve_d(v4i64 _1) { return __lasx_xvpickve_d(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvssrlrn_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvssrlrn_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssrlrn_b_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrlrn_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvssrlrn_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssrlrn_h_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrlrn_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvssrlrn_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssrlrn_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrln_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvssrln_b_h(v16i16 _1, v16i16 _2) { return __lasx_xvssrln_b_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrln_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvssrln_h_w(v8i32 _1, v8i32 _2) { return __lasx_xvssrln_h_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrln_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvssrln_w_d(v4i64 _1, v4i64 _2) { return __lasx_xvssrln_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvorn_v(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvorn_v(v32i8 _1, v32i8 _2) { return __lasx_xvorn_v(_1, _2); }
@@ -4638,22 +4638,22 @@ v32i8 xvorn_v(v32i8 _1, v32i8 _2) { return __lasx_xvorn_v(_1, _2); }
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldi(i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvldi() { return __lasx_xvldi(1); }
 // CHECK-LABEL: define dso_local void @xvldx(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1]], i64 1), !noalias [[META5:![0-9]+]]
-// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1]], i64 1), !noalias [[META7:![0-9]+]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvldx(void * _1) { return __lasx_xvldx(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvstx(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_112]], ptr [[_2]], i64 1)
 // CHECK-NEXT: ret void
 //
@@ -4661,209 +4661,209 @@ void xvstx(v32i8 _1, void * _2) { return __lasx_xvstx(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvextl_qu_du(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4u64 xvextl_qu_du(v4u64 _1) { return __lasx_xvextl_qu_du(_1); }
 // CHECK-LABEL: define dso_local void @xvinsgr2vr_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> [[_1]], i32 1, i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvinsgr2vr_w(v8i32 _1) { return __lasx_xvinsgr2vr_w(_1, 1, 1); }
 // CHECK-LABEL: define dso_local void @xvinsgr2vr_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> [[_1]], i64 1, i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvinsgr2vr_d(v4i64 _1) { return __lasx_xvinsgr2vr_d(_1, 1, 1); }
 // CHECK-LABEL: define dso_local void @xvreplve0_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> [[_112]])
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvreplve0_b(v32i8 _1) { return __lasx_xvreplve0_b(_1); }
 // CHECK-LABEL: define dso_local void @xvreplve0_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16> [[_112]])
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvreplve0_h(v16i16 _1) { return __lasx_xvreplve0_h(_1); }
 // CHECK-LABEL: define dso_local void @xvreplve0_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> [[_112]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvreplve0_w(v8i32 _1) { return __lasx_xvreplve0_w(_1); }
 // CHECK-LABEL: define dso_local void @xvreplve0_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvreplve0_d(v4i64 _1) { return __lasx_xvreplve0_d(_1); }
 // CHECK-LABEL: define dso_local void @xvreplve0_q(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> [[_112]])
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvreplve0_q(v32i8 _1) { return __lasx_xvreplve0_q(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_h_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> [[_112]])
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 vext2xv_h_b(v32i8 _1) { return __lasx_vext2xv_h_b(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_w_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> [[_112]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 vext2xv_w_h(v16i16 _1) { return __lasx_vext2xv_w_h(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_d_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> [[_112]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 vext2xv_d_w(v8i32 _1) { return __lasx_vext2xv_d_w(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_w_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> [[_112]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 vext2xv_w_b(v32i8 _1) { return __lasx_vext2xv_w_b(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_d_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> [[_112]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 vext2xv_d_h(v16i16 _1) { return __lasx_vext2xv_d_h(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_d_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> [[_112]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 vext2xv_d_b(v32i8 _1) { return __lasx_vext2xv_d_b(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_hu_bu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> [[_112]])
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 vext2xv_hu_bu(v32i8 _1) { return __lasx_vext2xv_hu_bu(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_wu_hu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> [[_112]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 vext2xv_wu_hu(v16i16 _1) { return __lasx_vext2xv_wu_hu(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_du_wu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> [[_112]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 vext2xv_du_wu(v8i32 _1) { return __lasx_vext2xv_du_wu(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_wu_bu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> [[_112]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 vext2xv_wu_bu(v32i8 _1) { return __lasx_vext2xv_wu_bu(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_du_hu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> [[_112]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 vext2xv_du_hu(v16i16 _1) { return __lasx_vext2xv_du_hu(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_du_bu(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> [[_112]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 vext2xv_du_bu(v32i8 _1) { return __lasx_vext2xv_du_bu(_1); }
 // CHECK-LABEL: define dso_local void @xvpermi_q(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvpermi_q(v32i8 _1, v32i8 _2) { return __lasx_xvpermi_q(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvpermi_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvpermi_d(v4i64 _1) { return __lasx_xvpermi_d(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvperm_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvperm_w(v8i32 _1, v8i32 _2) { return __lasx_xvperm_w(_1, _2); }
@@ -4871,7 +4871,7 @@ v8i32 xvperm_w(v8i32 _1, v8i32 _2) { return __lasx_xvperm_w(_1, _2); }
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(ptr [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvldrepl_b(void * _1) { return __lasx_xvldrepl_b(_1, 1); }
@@ -4879,7 +4879,7 @@ v32i8 xvldrepl_b(void * _1) { return __lasx_xvldrepl_b(_1, 1); }
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(ptr [[_1]], i32 2)
-// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvldrepl_h(void * _1) { return __lasx_xvldrepl_h(_1, 2); }
@@ -4887,7 +4887,7 @@ v16i16 xvldrepl_h(void * _1) { return __lasx_xvldrepl_h(_1, 2); }
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(ptr [[_1]], i32 4)
-// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvldrepl_w(void * _1) { return __lasx_xvldrepl_w(_1, 4); }
@@ -4895,14 +4895,14 @@ v8i32 xvldrepl_w(void * _1) { return __lasx_xvldrepl_w(_1, 4); }
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(ptr [[_1]], i32 8)
-// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvldrepl_d(void * _1) { return __lasx_xvldrepl_d(_1, 8); }
 // CHECK-LABEL: define dso_local signext i32 @xvpickve2gr_w(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> [[_1]], i32 1)
 // CHECK-NEXT: ret i32 [[TMP1]]
 //
@@ -4910,7 +4910,7 @@ int xvpickve2gr_w(v8i32 _1) { return __lasx_xvpickve2gr_w(_1, 1); }
 // CHECK-LABEL: define dso_local signext i32 @xvpickve2gr_wu(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> [[_1]], i32 1)
 // CHECK-NEXT: ret i32 [[TMP1]]
 //
@@ -4918,7 +4918,7 @@ unsigned int xvpickve2gr_wu(v8i32 _1) { return __lasx_xvpickve2gr_wu(_1, 1); }
 // CHECK-LABEL: define dso_local i64 @xvpickve2gr_d(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> [[_1]], i32 1)
 // CHECK-NEXT: ret i64 [[TMP1]]
 //
@@ -4926,7 +4926,7 @@ long xvpickve2gr_d(v4i64 _1) { return __lasx_xvpickve2gr_d(_1, 1); }
 // CHECK-LABEL: define dso_local i64 @xvpickve2gr_du(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> [[_1]], i32 1)
 // CHECK-NEXT: ret i64 [[TMP1]]
 //
@@ -4934,1626 +4934,1626 @@ unsigned long int xvpickve2gr_du(v4i64 _1) { return __lasx_xvpickve2gr_du(_1, 1)
 // CHECK-LABEL: define dso_local void @xvaddwev_q_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvaddwev_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvaddwev_q_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvaddwev_d_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvaddwev_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvaddwev_d_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvaddwev_w_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvaddwev_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvaddwev_w_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvaddwev_h_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>,
ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvaddwev_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvaddwev_h_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwev_q_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwev_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvaddwev_q_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwev_d_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwev_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvaddwev_d_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwev_w_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvaddwev_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvaddwev_w_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwev_h_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvaddwev_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvaddwev_h_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubwev_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvsubwev_q_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: 
[[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubwev_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvsubwev_d_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_w_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsubwev_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvsubwev_w_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsubwev_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvsubwev_h_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_q_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr 
[[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubwev_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvsubwev_q_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_d_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubwev_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsubwev_d_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_w_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsubwev_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsubwev_w_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_h_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa 
[[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsubwev_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsubwev_h_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwev_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvmulwev_q_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwev_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvmulwev_d_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_w_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = 
tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmulwev_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvmulwev_w_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmulwev_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvmulwev_h_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_q_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwev_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvmulwev_q_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_d_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwev_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmulwev_d_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_w_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmulwev_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmulwev_w_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_h_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmulwev_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmulwev_h_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> 
@llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwod_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvaddwod_q_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwod_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvaddwod_d_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_w_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvaddwod_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvaddwod_w_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 
x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvaddwod_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvaddwod_h_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_q_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwod_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvaddwod_q_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_d_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwod_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvaddwod_d_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_w_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) 
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvaddwod_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvaddwod_w_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_h_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvaddwod_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvaddwod_h_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwod_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubwod_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvsubwod_q_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwod_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], 
ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubwod_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvsubwod_d_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwod_w_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsubwod_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvsubwod_w_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwod_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsubwod_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvsubwod_h_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwod_q_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubwod_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvsubwod_q_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwod_d_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubwod_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvsubwod_d_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwod_w_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsubwod_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvsubwod_w_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwod_h_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsubwod_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvsubwod_h_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwod_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwod_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvmulwod_q_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwod_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwod_d_w(v8i32 _1, v8i32 _2) { return __lasx_xvmulwod_d_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwod_w_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> [[_124]], <16 x i16> [[_235]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmulwod_w_h(v16i16 _1, v16i16 _2) { return __lasx_xvmulwod_w_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwod_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> [[_124]], <32 x i8> [[_235]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmulwod_h_b(v32i8 _1, v32i8 _2) { return __lasx_xvmulwod_h_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwod_q_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwod_q_du(v4u64 _1, v4u64 _2) { return __lasx_xvmulwod_q_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwod_d_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> [[_124]], <8 x i32> [[_235]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmulwod_d_wu(v8u32 _1, v8u32 _2) { return __lasx_xvmulwod_d_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_w_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmulwod_w_hu(v16u16 _1, v16u16 _2) { return __lasx_xvmulwod_w_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_h_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmulwod_h_bu(v32u8 _1, v32u8 _2) { return __lasx_xvmulwod_h_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwev_d_wu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvaddwev_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvaddwev_d_wu_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwev_w_hu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvaddwev_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvaddwev_w_hu_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwev_h_bu_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvaddwev_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvaddwev_h_bu_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwev_d_wu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmulwev_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvmulwev_d_wu_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwev_w_hu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmulwev_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvmulwev_w_hu_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwev_h_bu_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmulwev_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvmulwev_h_bu_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwod_d_wu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvaddwod_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvaddwod_d_wu_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwod_w_hu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvaddwod_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvaddwod_w_hu_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwod_h_bu_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvaddwod_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvaddwod_h_bu_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_d_wu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmulwod_d_wu_w(v8u32 _1, v8i32 _2) { return __lasx_xvmulwod_d_wu_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_w_hu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmulwod_w_hu_h(v16u16 _1, v16i16 _2) { return __lasx_xvmulwod_w_hu_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_h_bu_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmulwod_h_bu_b(v32u8 _1, v32i8 _2) { return __lasx_xvmulwod_h_bu_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhaddw_q_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvhaddw_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvhaddw_q_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhaddw_qu_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvhaddw_qu_du(v4u64 _1, v4u64 _2) { return __lasx_xvhaddw_qu_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhsubw_q_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvhsubw_q_d(v4i64 _1, v4i64 _2) { return __lasx_xvhsubw_q_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhsubw_qu_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvhsubw_qu_du(v4u64 _1, v4u64 _2) { return __lasx_xvhsubw_qu_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmaddwev_q_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmaddwev_q_d(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_d_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmaddwev_d_w(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_w_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmaddwev_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmaddwev_w_h(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_h_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmaddwev_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmaddwev_h_b(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_q_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmaddwev_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __lasx_xvmaddwev_q_du(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_d_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmaddwev_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __lasx_xvmaddwev_d_wu(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_w_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvmaddwev_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __lasx_xvmaddwev_w_hu(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_h_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvmaddwev_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __lasx_xvmaddwev_h_bu(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_q_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __lasx_xvmaddwod_q_d(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_d_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __lasx_xvmaddwod_d_w(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_w_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmaddwod_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __lasx_xvmaddwod_w_h(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_h_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmaddwod_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __lasx_xvmaddwod_h_b(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_q_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmaddwod_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __lasx_xvmaddwod_q_du(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_d_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmaddwod_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __lasx_xvmaddwod_d_wu(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_w_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvmaddwod_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __lasx_xvmaddwod_w_hu(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_h_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvmaddwod_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __lasx_xvmaddwod_h_bu(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_q_du_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __lasx_xvmaddwev_q_du_d(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_d_wu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __lasx_xvmaddwev_d_wu_w(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_w_hu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmaddwev_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __lasx_xvmaddwev_w_hu_h(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_h_bu_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmaddwev_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __lasx_xvmaddwev_h_bu_b(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_q_du_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __lasx_xvmaddwod_q_du_d(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_d_wu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_346:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_235]], <8 x i32> [[_346]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __lasx_xvmaddwod_d_wu_w(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_w_hu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> [[_136]], <16 x i16> [[_247]], <16 x i16> [[_358]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmaddwod_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __lasx_xvmaddwod_w_hu_h(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_h_bu_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_136:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_247:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_358:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> [[_136]], <32 x i8> [[_247]], <32 x i8> [[_358]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmaddwod_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __lasx_xvmaddwod_h_bu_b(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvrotr_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> [[_124]], <32 x i8> [[_235]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvrotr_b(v32i8 _1, v32i8 _2) { return __lasx_xvrotr_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvrotr_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> [[_124]], <16 x i16> [[_235]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvrotr_h(v16i16 _1, v16i16 _2) { return __lasx_xvrotr_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvrotr_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_124:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_235:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> [[_124]], <8 x i32> [[_235]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvrotr_w(v8i32 _1, v8i32 _2) { return __lasx_xvrotr_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvrotr_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvrotr_d(v4i64 _1, v4i64 _2) { return __lasx_xvrotr_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvadd_q(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvadd_q(v4i64 _1, v4i64 _2) { return __lasx_xvadd_q(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsub_q(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsub_q(v4i64 _1, v4i64 _2) { return __lasx_xvsub_q(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwev_q_du_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvaddwev_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvaddwev_q_du_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwod_q_du_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvaddwod_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvaddwod_q_du_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwev_q_du_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmulwev_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvmulwev_q_du_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_q_du_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmulwod_q_du_d(v4u64 _1, v4i64 _2) { return __lasx_xvmulwod_q_du_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmskgez_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8> [[_112]])
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvmskgez_b(v32i8 _1) { return __lasx_xvmskgez_b(_1); }
// CHECK-LABEL: define dso_local void @xvmsknz_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> [[_112]])
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvmsknz_b(v32i8 _1) { return __lasx_xvmsknz_b(_1); }
// CHECK-LABEL: define dso_local void @xvexth_h_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> [[_112]])
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvexth_h_b(v32i8 _1) { return __lasx_xvexth_h_b(_1); }
// CHECK-LABEL: define dso_local void @xvexth_w_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32,
!tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvexth_w_h(v16i16 _1) { return __lasx_xvexth_w_h(_1); } // CHECK-LABEL: define dso_local void @xvexth_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> [[_112]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvexth_d_w(v8i32 _1) { return __lasx_xvexth_d_w(_1); } // CHECK-LABEL: define dso_local void @xvexth_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvexth_q_d(v4i64 _1) { return __lasx_xvexth_q_d(_1); } // CHECK-LABEL: define dso_local void @xvexth_hu_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> [[_112]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvexth_hu_bu(v32u8 _1) { return __lasx_xvexth_hu_bu(_1); } // CHECK-LABEL: define dso_local void @xvexth_wu_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: [[_112:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> [[_112]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvexth_wu_hu(v16u16 _1) { return __lasx_xvexth_wu_hu(_1); } // CHECK-LABEL: define dso_local void @xvexth_du_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_112:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> [[_112]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvexth_du_wu(v8u32 _1) { return __lasx_xvexth_du_wu(_1); } // CHECK-LABEL: define dso_local void @xvexth_qu_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvexth_qu_du(v4u64 _1) { return __lasx_xvexth_qu_du(_1); } // CHECK-LABEL: define dso_local void @xvrotri_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvrotri_b(v32i8 _1) { return __lasx_xvrotri_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvrotri_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvrotri_h(v16i16 _1) { return __lasx_xvrotri_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvrotri_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvrotri_w(v8i32 _1) { return __lasx_xvrotri_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvrotri_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvrotri_d(v4i64 _1) { return __lasx_xvrotri_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvextl_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvextl_q_d(v4i64 _1) { return __lasx_xvextl_q_d(_1); } // CHECK-LABEL: define dso_local void @xvsrlni_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrlni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrlni_b_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlni_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrlni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrlni_h_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlni_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrlni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrlni_w_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlni_d_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load 
<4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsrlni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrlni_d_q(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlrni_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrlrni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrlrni_b_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlrni_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrlrni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrlrni_h_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlrni_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa 
[[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrlrni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrlrni_w_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlrni_d_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsrlrni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrlrni_d_q(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvssrlni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrlni_b_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = 
load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvssrlni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrlni_h_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvssrlni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrlni_w_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_d_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvssrlni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrlni_d_q(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_bu_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, 
!tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvssrlni_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrlni_bu_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_hu_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvssrlni_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrlni_hu_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_wu_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvssrlni_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrlni_wu_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_du_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// 
CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvssrlni_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrlni_du_q(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlrni_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvssrlrni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrlrni_b_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlrni_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvssrlrni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrlrni_h_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlrni_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = 
load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvssrlrni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrlrni_w_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlrni_d_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvssrlrni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrlrni_d_q(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlrni_bu_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvssrlrni_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrlrni_bu_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlrni_hu_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr 
[[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvssrlrni_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrlrni_hu_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlrni_wu_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvssrlrni_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrlrni_wu_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlrni_du_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvssrlrni_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrlrni_du_q(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrani_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, 
!tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrani_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrani_b_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrani_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrani_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrani_h_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrani_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrani_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrani_w_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrani_d_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] 
= tail call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsrani_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrani_d_q(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrarni_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrarni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvsrarni_b_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrarni_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrarni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvsrarni_h_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrarni_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsrarni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvsrarni_w_d(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvsrarni_d_q(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsrarni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvsrarni_d_q(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrani_b_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvssrani_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrani_b_h(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrani_h_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvssrani_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrani_h_w(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrani_w_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvssrani_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrani_w_d(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrani_d_q(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvssrani_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrani_d_q(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrani_bu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvssrani_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrani_bu_h(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrani_hu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvssrani_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrani_hu_w(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrani_wu_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvssrani_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrani_wu_d(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrani_du_q(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvssrani_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrani_du_q(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrarni_b_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvssrarni_b_h(v32i8 _1, v32i8 _2) { return __lasx_xvssrarni_b_h(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrarni_h_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvssrarni_h_w(v16i16 _1, v16i16 _2) { return __lasx_xvssrarni_h_w(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrarni_w_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvssrarni_w_d(v8i32 _1, v8i32 _2) { return __lasx_xvssrarni_w_d(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrarni_d_q(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvssrarni_d_q(v4i64 _1, v4i64 _2) { return __lasx_xvssrarni_d_q(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrarni_bu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvssrarni_bu_h(v32u8 _1, v32i8 _2) { return __lasx_xvssrarni_bu_h(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrarni_hu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvssrarni_hu_w(v16u16 _1, v16i16 _2) { return __lasx_xvssrarni_hu_w(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrarni_wu_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvssrarni_wu_d(v8u32 _1, v8i32 _2) { return __lasx_xvssrarni_wu_d(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvssrarni_du_q(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvssrarni_du_q(v4u64 _1, v4i64 _2) { return __lasx_xvssrarni_du_q(_1, _2, 1); }
// CHECK-LABEL: define dso_local signext i32 @xbnz_b(
// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8> [[_1]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
@@ -6561,7 +6561,7 @@ int xbnz_b(v32u8 _1) { return __lasx_xbnz_b(_1); }
// CHECK-LABEL: define dso_local signext i32 @xbnz_d(
// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> [[_1]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
@@ -6569,7 +6569,7 @@ int xbnz_d(v4u64 _1) { return __lasx_xbnz_d(_1); }
// CHECK-LABEL: define dso_local signext i32 @xbnz_h(
// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> [[_1]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
@@ -6577,7 +6577,7 @@ int xbnz_h(v16u16 _1) { return __lasx_xbnz_h(_1); }
// CHECK-LABEL: define dso_local signext i32 @xbnz_v(
// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> [[_1]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
@@ -6585,7 +6585,7 @@ int xbnz_v(v32u8 _1) { return __lasx_xbnz_v(_1); }
// CHECK-LABEL: define dso_local signext i32 @xbnz_w(
// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> [[_1]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
@@ -6593,7 +6593,7 @@ int xbnz_w(v8u32 _1) { return __lasx_xbnz_w(_1); }
// CHECK-LABEL: define dso_local signext i32 @xbz_b(
// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> [[_1]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
@@ -6601,7 +6601,7 @@ int xbz_b(v32u8 _1) { return __lasx_xbz_b(_1); }
// CHECK-LABEL: define dso_local signext i32 @xbz_d(
// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> [[_1]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
@@ -6609,7 +6609,7 @@ int xbz_d(v4u64 _1) { return __lasx_xbz_d(_1); }
// CHECK-LABEL: define dso_local signext i32 @xbz_h(
// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> [[_1]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
@@ -6617,7 +6617,7 @@ int xbz_h(v16u16 _1) { return __lasx_xbz_h(_1); }
// CHECK-LABEL: define dso_local signext i32 @xbz_v(
// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> [[_1]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
@@ -6625,7 +6625,7 @@ int xbz_v(v32u8 _1) { return __lasx_xbz_v(_1); }
// CHECK-LABEL: define dso_local signext i32 @xbz_w(
// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> [[_1]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
@@ -6633,458 +6633,458 @@ int xbz_w(v8u32 _1) { return __lasx_xbz_w(_1); }
// CHECK-LABEL: define dso_local void @xvfcmp_caf_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_caf_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_caf_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_caf_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_caf_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_caf_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_ceq_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_ceq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_ceq_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_ceq_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_ceq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_ceq_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cle_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cle_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cle_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cle_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cle_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cle_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_clt_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_clt_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_clt_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_clt_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_clt_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_clt_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cne_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cne_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cne_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cne_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cne_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cne_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cor_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cor_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cor_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cor_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cor_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cor_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cueq_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cueq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cueq_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cueq_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cueq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cueq_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cule_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cule_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cule_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cule_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cule_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cule_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cult_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cult_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cult_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cult_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cult_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cult_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cun_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cun_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cun_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cune_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_cune_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_cune_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cune_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cune_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cune_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_cun_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_cun_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_cun_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_saf_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_saf_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_saf_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_saf_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_saf_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_saf_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_seq_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_seq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_seq_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_seq_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_seq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_seq_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sle_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sle_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sle_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sle_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sle_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sle_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_slt_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_slt_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_slt_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_slt_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_slt_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_slt_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sne_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sne_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sne_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sne_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sne_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sne_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sor_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sor_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sor_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sor_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sor_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sor_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sueq_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sueq_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sueq_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sueq_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sueq_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sueq_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sule_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sule_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sule_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sule_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sule_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sule_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sult_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sult_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sult_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sult_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sult_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sult_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sun_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sun_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sun_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sune_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]])
local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sune_d(v4f64 _1, v4f64 _2) { return __lasx_xvfcmp_sune_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfcmp_sune_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sune_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sune_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvfcmp_sun_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sun_s(v8f32 _1, v8f32 _2) { return __lasx_xvfcmp_sun_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvpickve_d_f( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = 
load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> [[_1]], i32 1) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvpickve_d_f(v4f64 _1) { return __lasx_xvpickve_d_f(_1, 1); } // CHECK-LABEL: define dso_local void @xvpickve_w_f( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> [[_1]], i32 1) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvpickve_w_f(v8f32 _1) { return __lasx_xvpickve_w_f(_1, 1); } @@ -7092,7 +7092,7 @@ v8f32 xvpickve_w_f(v8f32 _1) { return __lasx_xvpickve_w_f(_1, 1); } // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvrepli_b() { return __lasx_xvrepli_b(1); } @@ -7100,7 +7100,7 @@ v32i8 xvrepli_b() { return __lasx_xvrepli_b(1); } // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvrepli_d() { return __lasx_xvrepli_d(1); } @@ -7108,7 +7108,7 @@ v4i64 xvrepli_d() { return __lasx_xvrepli_d(1); } // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvrepli_h() { return __lasx_xvrepli_h(1); } @@ -7116,15 +7116,15 @@ v16i16 xvrepli_h() { return __lasx_xvrepli_h(1); } // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) 
initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvrepli_w() { return __lasx_xvrepli_w(1); } //. -// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} -// CHECK: [[META5]] = !{[[META6:![0-9]+]]} -// CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"__lasx_xvldx: %agg.result"} -// CHECK: [[META7]] = distinct !{[[META7]], !"__lasx_xvldx"} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0} +// CHECK: [[META7]] = !{[[META8:![0-9]+]]} +// CHECK: [[META8]] = distinct !{[[META8]], [[META9:![0-9]+]], !"__lasx_xvldx: %agg.result"} +// CHECK: [[META9]] = distinct !{[[META9]], !"__lasx_xvldx"} //. diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c index b194ea8f3182a..1cbcdcf402893 100644 --- a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c +++ b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c @@ -6,41 +6,41 @@ // CHECK-LABEL: define dso_local void @xvfrecipe_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2:![0-9]+]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfrecipe_s(v8f32 _1) { return __lasx_xvfrecipe_s(_1); } // CHECK-LABEL: define dso_local void @xvfrecipe_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfrecipe_d(v4f64 _1) { return __lasx_xvfrecipe_d(_1); } // CHECK-LABEL: define dso_local void @xvfrsqrte_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfrsqrte_s(v8f32 _1) { return __lasx_xvfrsqrte_s(_1); } // CHECK-LABEL: define dso_local void @xvfrsqrte_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfrsqrte_d(v4f64 _1) { return __lasx_xvfrsqrte_d(_1); } //. -// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0} //. 
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c
index 9d543dfabe3d2..5276a488d1d1a 100644
--- a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c
@@ -7,41 +7,41 @@ typedef double v4f64 __attribute__((vector_size(32), aligned(32)));
// CHECK-LABEL: define dso_local void @xvfrecipe_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2:![0-9]+]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6:![0-9]+]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8f32 xvfrecipe_s(v8f32 _1) { return __builtin_lasx_xvfrecipe_s(_1); }
// CHECK-LABEL: define dso_local void @xvfrecipe_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4f64 xvfrecipe_d(v4f64 _1) { return __builtin_lasx_xvfrecipe_d(_1); }
// CHECK-LABEL: define dso_local void @xvfrsqrte_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8f32 xvfrsqrte_s(v8f32 _1) { return __builtin_lasx_xvfrsqrte_s(_1); }
// CHECK-LABEL: define dso_local void @xvfrsqrte_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4f64 xvfrsqrte_d(v4f64 _1) { return __builtin_lasx_xvfrsqrte_d(_1); }
//.
-// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
-// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0}
//.
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin.c b/clang/test/CodeGen/LoongArch/lasx/builtin.c
index 9b21c7ea3e8a5..700e845cd662a 100644
--- a/clang/test/CodeGen/LoongArch/lasx/builtin.c
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin.c
@@ -28,2808 +28,2808 @@ typedef double v4f64_d __attribute__((vector_size(32), aligned(8)));
// CHECK-LABEL: define dso_local void @xvsll_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2:![0-9]+]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6:![0-9]+]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsll_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsll_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsll_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsll_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsll_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsll_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsll_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsll_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsll_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsll_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsll_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvslli_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvslli_b(v32i8 _1) { return __builtin_lasx_xvslli_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvslli_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvslli_h(v16i16 _1) { return __builtin_lasx_xvslli_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvslli_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvslli_w(v8i32 _1) { return __builtin_lasx_xvslli_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvslli_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvslli_d(v4i64 _1) { return __builtin_lasx_xvslli_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsra_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsra_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsra_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsra_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsra_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsra_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsra_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsra_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsra_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsra_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsra_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsra_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrai_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsrai_b(v32i8 _1) { return __builtin_lasx_xvsrai_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrai_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsrai_h(v16i16 _1) { return __builtin_lasx_xvsrai_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrai_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsrai_w(v8i32 _1) { return __builtin_lasx_xvsrai_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrai_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsrai_d(v4i64 _1) { return __builtin_lasx_xvsrai_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrar_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsrar_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrar_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrar_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsrar_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrar_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrar_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsrar_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrar_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrar_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsrar_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrar_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrari_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsrari_b(v32i8 _1) { return __builtin_lasx_xvsrari_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrari_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsrari_h(v16i16 _1) { return __builtin_lasx_xvsrari_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrari_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsrari_w(v8i32 _1) { return __builtin_lasx_xvsrari_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrari_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsrari_d(v4i64 _1) { return __builtin_lasx_xvsrari_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrl_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsrl_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrl_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrl_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsrl_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrl_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrl_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsrl_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrl_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrl_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsrl_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrl_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrli_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsrli_b(v32i8 _1) { return __builtin_lasx_xvsrli_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrli_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsrli_h(v16i16 _1) { return __builtin_lasx_xvsrli_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrli_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsrli_w(v8i32 _1) { return __builtin_lasx_xvsrli_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrli_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsrli_d(v4i64 _1) { return __builtin_lasx_xvsrli_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrlr_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsrlr_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrlr_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrlr_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsrlr_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlr_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrlr_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsrlr_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlr_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrlr_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsrlr_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlr_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsrlri_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsrlri_b(v32i8 _1) { return __builtin_lasx_xvsrlri_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrlri_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsrlri_h(v16i16 _1) { return __builtin_lasx_xvsrlri_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrlri_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsrlri_w(v8i32 _1) { return __builtin_lasx_xvsrlri_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsrlri_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsrlri_d(v4i64 _1) { return __builtin_lasx_xvsrlri_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitclr_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa
[[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvbitclr_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitclr_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitclr_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvbitclr_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvbitclr_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitclr_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvbitclr_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvbitclr_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitclr_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr 
[[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvbitclr_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvbitclr_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitclri_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvbitclri_b(v32u8 _1) { return __builtin_lasx_xvbitclri_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitclri_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvbitclri_h(v16u16 _1) { return __builtin_lasx_xvbitclri_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitclri_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvbitclri_w(v8u32 _1) { return __builtin_lasx_xvbitclri_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitclri_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvbitclri_d(v4u64 _1) { return __builtin_lasx_xvbitclri_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitset_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvbitset_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitset_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitset_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvbitset_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvbitset_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitset_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, 
!tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvbitset_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvbitset_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitset_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvbitset_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvbitset_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitseti_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvbitseti_b(v32u8 _1) { return __builtin_lasx_xvbitseti_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitseti_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvbitseti_h(v16u16 _1) { return __builtin_lasx_xvbitseti_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitseti_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) 
initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvbitseti_w(v8u32 _1) { return __builtin_lasx_xvbitseti_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitseti_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvbitseti_d(v4u64 _1) { return __builtin_lasx_xvbitseti_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitrev_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvbitrev_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitrev_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitrev_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // 
CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvbitrev_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvbitrev_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitrev_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvbitrev_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvbitrev_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitrev_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvbitrev_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvbitrev_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvbitrevi_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void 
// v32u8 xvbitrevi_b(v32u8 _1) { return __builtin_lasx_xvbitrevi_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitrevi_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvbitrevi_h(v16u16 _1) { return __builtin_lasx_xvbitrevi_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitrevi_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvbitrevi_w(v8u32 _1) { return __builtin_lasx_xvbitrevi_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvbitrevi_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvbitrevi_d(v4u64 _1) { return __builtin_lasx_xvbitrevi_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvadd_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call 
<32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvadd_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvadd_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvadd_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvadd_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvadd_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvadd_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvadd_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvadd_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvadd_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x 
i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvadd_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvadd_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddi_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvaddi_bu(v32i8 _1) { return __builtin_lasx_xvaddi_bu(_1, 1); } // CHECK-LABEL: define dso_local void @xvaddi_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvaddi_hu(v16i16 _1) { return __builtin_lasx_xvaddi_hu(_1, 1); } // CHECK-LABEL: define dso_local void @xvaddi_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvaddi_wu(v8i32 _1) { return __builtin_lasx_xvaddi_wu(_1, 1); } // CHECK-LABEL: define dso_local void @xvaddi_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: 
store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddi_du(v4i64 _1) { return __builtin_lasx_xvaddi_du(_1, 1); } // CHECK-LABEL: define dso_local void @xvsub_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsub_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsub_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvsub_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsub_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsub_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsub_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> 
[[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsub_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsub_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsub_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsub_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsub_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubi_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsubi_bu(v32i8 _1) { return __builtin_lasx_xvsubi_bu(_1, 1); } // CHECK-LABEL: define dso_local void @xvsubi_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsubi_hu(v16i16 _1) { return __builtin_lasx_xvsubi_hu(_1, 1); } // CHECK-LABEL: define dso_local void @xvsubi_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr 
[[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsubi_wu(v8i32 _1) { return __builtin_lasx_xvsubi_wu(_1, 1); } // CHECK-LABEL: define dso_local void @xvsubi_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubi_du(v4i64 _1) { return __builtin_lasx_xvsubi_du(_1, 1); } // CHECK-LABEL: define dso_local void @xvmax_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmax_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmax_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvmax_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 
xvmax_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmax_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvmax_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmax_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmax_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvmax_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmax_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmax_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvmaxi_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmaxi_b(v32i8 _1) { return __builtin_lasx_xvmaxi_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvmaxi_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: 
[[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmaxi_h(v16i16 _1) { return __builtin_lasx_xvmaxi_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvmaxi_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmaxi_w(v8i32 _1) { return __builtin_lasx_xvmaxi_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvmaxi_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmaxi_d(v4i64 _1) { return __builtin_lasx_xvmaxi_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvmax_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvmax_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmax_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmax_hu( 
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvmax_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmax_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmax_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvmax_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmax_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmax_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvmax_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmax_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvmaxi_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvmaxi_bu(v32u8 _1) { return __builtin_lasx_xvmaxi_bu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmaxi_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvmaxi_hu(v16u16 _1) { return __builtin_lasx_xvmaxi_hu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmaxi_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvmaxi_wu(v8u32 _1) { return __builtin_lasx_xvmaxi_wu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmaxi_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmaxi_du(v4u64 _1) { return __builtin_lasx_xvmaxi_du(_1, 1); }
// CHECK-LABEL: define dso_local void @xvmin_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvmin_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmin_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmin_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmin_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmin_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmin_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmin_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmin_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmin_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly
captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmin_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmin_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvmini_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmini_b(v32i8 _1) { return __builtin_lasx_xvmini_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvmini_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmini_h(v16i16 _1) { return __builtin_lasx_xvmini_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvmini_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmini_w(v8i32 _1) { return 
__builtin_lasx_xvmini_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvmini_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmini_d(v4i64 _1) { return __builtin_lasx_xvmini_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvmin_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvmin_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmin_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmin_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvmin_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmin_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmin_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) 
local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvmin_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmin_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmin_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvmin_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmin_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvmini_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvmini_bu(v32u8 _1) { return __builtin_lasx_xvmini_bu(_1, 1); } // CHECK-LABEL: define dso_local void @xvmini_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvmini_hu(v16u16 _1) { return __builtin_lasx_xvmini_hu(_1, 1); } // CHECK-LABEL: define dso_local void @xvmini_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvmini_wu(v8u32 _1) { return __builtin_lasx_xvmini_wu(_1, 1); } // CHECK-LABEL: define dso_local void @xvmini_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvmini_du(v4u64 _1) { return __builtin_lasx_xvmini_du(_1, 1); } // CHECK-LABEL: define dso_local void @xvseq_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvseq_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvseq_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvseq_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x 
i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvseq_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvseq_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvseq_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvseq_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvseq_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvseq_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvseq_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvseq_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvseqi_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] 
= tail call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvseqi_b(v32i8 _1) { return __builtin_lasx_xvseqi_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvseqi_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvseqi_h(v16i16 _1) { return __builtin_lasx_xvseqi_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvseqi_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvseqi_w(v8i32 _1) { return __builtin_lasx_xvseqi_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvseqi_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvseqi_d(v4i64 _1) { return __builtin_lasx_xvseqi_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvslt_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 
32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvslt_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvslt_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvslt_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvslt_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvslt_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvslt_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvslt_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvslt_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvslt_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvslt_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvslt_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvslti_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvslti_b(v32i8 _1) { return __builtin_lasx_xvslti_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvslti_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvslti_h(v16i16 _1) { return __builtin_lasx_xvslti_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvslti_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvslti_w(v8i32 _1) { return __builtin_lasx_xvslti_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvslti_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, 
!tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvslti_d(v4i64 _1) { return __builtin_lasx_xvslti_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvslt_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvslt_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvslt_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvslt_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvslt_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvslt_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvslt_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // 
CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvslt_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvslt_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvslt_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvslt_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvslt_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvslti_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvslti_bu(v32u8 _1) { return __builtin_lasx_xvslti_bu(_1, 1); } // CHECK-LABEL: define dso_local void @xvslti_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvslti_hu(v16u16 _1) { return __builtin_lasx_xvslti_hu(_1, 1); } // CHECK-LABEL: define dso_local void @xvslti_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvslti_wu(v8u32 _1) { return __builtin_lasx_xvslti_wu(_1, 1); } // CHECK-LABEL: define dso_local void @xvslti_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvslti_du(v4u64 _1) { return __builtin_lasx_xvslti_du(_1, 1); } // CHECK-LABEL: define dso_local void @xvsle_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsle_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsle_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvsle_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> [[_1]], 
<16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsle_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsle_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsle_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsle_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsle_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsle_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsle_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsle_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvslei_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvslei_b(v32i8 _1) { return __builtin_lasx_xvslei_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvslei_h( // CHECK-SAME: 
ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvslei_h(v16i16 _1) { return __builtin_lasx_xvslei_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvslei_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvslei_w(v8i32 _1) { return __builtin_lasx_xvslei_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvslei_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvslei_d(v4i64 _1) { return __builtin_lasx_xvslei_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvsle_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsle_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsle_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsle_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsle_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsle_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsle_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsle_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsle_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsle_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsle_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsle_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvslei_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvslei_bu(v32u8 _1) { return __builtin_lasx_xvslei_bu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvslei_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvslei_hu(v16u16 _1) { return __builtin_lasx_xvslei_hu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvslei_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvslei_wu(v8u32 _1) { return __builtin_lasx_xvslei_wu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvslei_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvslei_du(v4u64 _1) { return __builtin_lasx_xvslei_du(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsat_b(v32i8 _1) { return __builtin_lasx_xvsat_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsat_h(v16i16 _1) { return __builtin_lasx_xvsat_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsat_w(v8i32 _1) { return __builtin_lasx_xvsat_w(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsat_d(v4i64 _1) { return __builtin_lasx_xvsat_d(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvsat_bu(v32u8 _1) { return __builtin_lasx_xvsat_bu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvsat_hu(v16u16 _1) { return __builtin_lasx_xvsat_hu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvsat_wu(v8u32 _1) { return __builtin_lasx_xvsat_wu(_1, 1); }
// CHECK-LABEL: define dso_local void @xvsat_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvsat_du(v4u64 _1) { return __builtin_lasx_xvsat_du(_1, 1); }
// CHECK-LABEL: define dso_local void @xvadda_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvadda_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvadda_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvadda_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvadda_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvadda_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvadda_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvadda_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvadda_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvadda_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvadda_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvadda_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvsadd_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsadd_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsadd_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsadd_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsadd_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsadd_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsadd_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsadd_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvsadd_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsadd_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvsadd_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsadd_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvsadd_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsadd_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsadd_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvsadd_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsadd_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavg_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvavg_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvavg_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavg_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvavg_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvavg_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavg_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvavg_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvavg_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavg_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvavg_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvavg_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavg_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvavg_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvavg_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavg_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvavg_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvavg_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavg_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvavg_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvavg_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavg_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvavg_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvavg_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavgr_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvavgr_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvavgr_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavgr_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvavgr_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvavgr_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavgr_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvavgr_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvavgr_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavgr_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvavgr_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvavgr_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavgr_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvavgr_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvavgr_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavgr_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvavgr_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvavgr_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavgr_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvavgr_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvavgr_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvavgr_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvavgr_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvavgr_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvssub_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvssub_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssub_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvssub_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvssub_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssub_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvssub_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvssub_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssub_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvssub_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvssub_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssub_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvssub_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvssub_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvssub_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvssub_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvssub_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssub_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvssub_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvssub_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssub_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvssub_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvssub_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssub_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvabsd_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvabsd_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvabsd_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvabsd_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvabsd_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvabsd_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvabsd_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvabsd_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvabsd_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvabsd_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvabsd_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvabsd_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvabsd_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvabsd_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvabsd_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvabsd_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvabsd_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvabsd_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvabsd_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvabsd_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvabsd_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvabsd_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvabsd_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvabsd_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmul_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvmul_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmul_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmul_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmul_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmul_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmul_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmul_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmul_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmul_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmul_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmul_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmadd_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
-// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvmadd_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmadd_b(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmadd_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmadd_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmadd_h(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmadd_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmadd_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmadd_w(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmadd_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] =
load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmadd_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmadd_d(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvmsub_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmsub_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmsub_b(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvmsub_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> 
[[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmsub_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmsub_h(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvmsub_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmsub_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmsub_w(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvmsub_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmsub_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmsub_d(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvdiv_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, 
ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvdiv_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvdiv_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvdiv_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvdiv_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvdiv_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvdiv_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvdiv_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvdiv_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvdiv_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], 
align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvdiv_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvdiv_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvdiv_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvdiv_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvdiv_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvdiv_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvdiv_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvdiv_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvdiv_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], 
align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvdiv_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvdiv_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvdiv_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvdiv_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvdiv_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvhaddw_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvhaddw_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvhaddw_h_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvhaddw_w_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> 
@llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvhaddw_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvhaddw_w_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvhaddw_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvhaddw_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvhaddw_d_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvhaddw_hu_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvhaddw_hu_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvhaddw_hu_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvhaddw_wu_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> 
[[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvhaddw_wu_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvhaddw_wu_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvhaddw_du_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvhaddw_du_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvhaddw_du_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvhsubw_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvhsubw_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvhsubw_h_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvhsubw_w_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: 
store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvhsubw_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvhsubw_w_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvhsubw_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvhsubw_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvhsubw_d_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvhsubw_hu_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvhsubw_hu_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvhsubw_hu_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvhsubw_wu_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvhsubw_wu_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvhsubw_wu_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvhsubw_du_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvhsubw_du_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvhsubw_du_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmod_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmod_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmod_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvmod_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store 
<16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmod_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmod_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvmod_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmod_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmod_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvmod_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmod_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmod_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvmod_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 
xvmod_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmod_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmod_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvmod_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmod_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmod_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvmod_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmod_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmod_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvmod_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmod_du(_1, _2); } // CHECK-LABEL: 
define dso_local void @xvrepl128vei_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvrepl128vei_b(v32i8 _1) { return __builtin_lasx_xvrepl128vei_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvrepl128vei_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvrepl128vei_h(v16i16 _1) { return __builtin_lasx_xvrepl128vei_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvrepl128vei_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvrepl128vei_w(v8i32 _1) { return __builtin_lasx_xvrepl128vei_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvrepl128vei_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 
xvrepl128vei_d(v4i64 _1) { return __builtin_lasx_xvrepl128vei_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvpickev_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvpickev_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpickev_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvpickev_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvpickev_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpickev_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvpickev_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvpickev_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpickev_w(_1, 
_2); }
// CHECK-LABEL: define dso_local void @xvpickev_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvpickev_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpickev_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpickod_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvpickod_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpickod_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpickod_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvpickod_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpickod_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpickod_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvpickod_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpickod_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpickod_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvpickod_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpickod_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvh_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvilvh_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvilvh_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvh_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvilvh_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvilvh_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvh_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvilvh_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvilvh_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvh_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvilvh_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvilvh_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvl_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvilvl_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvilvl_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvl_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvilvl_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvilvl_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvl_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvilvl_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvilvl_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvilvl_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvilvl_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvilvl_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackev_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvpackev_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpackev_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackev_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvpackev_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpackev_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackev_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvpackev_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpackev_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackev_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvpackev_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpackev_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackod_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvpackod_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpackod_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackod_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvpackod_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvpackod_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackod_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvpackod_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpackod_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpackod_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvpackod_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvpackod_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvshuf_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
-// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvshuf_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvshuf_b(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvshuf_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvshuf_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvshuf_h(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvshuf_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvshuf_w(v8i32 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvshuf_w(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvshuf_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvshuf_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvshuf_d(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvand_v(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvand_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvand_v(_1, _2); }
// CHECK-LABEL: define dso_local void @xvandi_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvandi_b(v32u8 _1) { return __builtin_lasx_xvandi_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvor_v(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvor_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvor_v(_1, _2); }
// CHECK-LABEL: define dso_local void @xvori_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvori_b(v32u8 _1) { return __builtin_lasx_xvori_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvnor_v(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvnor_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvnor_v(_1, _2); }
// CHECK-LABEL: define dso_local void @xvnori_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvnori_b(v32u8 _1) { return __builtin_lasx_xvnori_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvxor_v(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvxor_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvxor_v(_1, _2); }
// CHECK-LABEL: define dso_local void @xvxori_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvxori_b(v32u8 _1) { return __builtin_lasx_xvxori_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvbitsel_v(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
-// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvbitsel_v(v32u8 _1, v32u8 _2, v32u8 _3) { return __builtin_lasx_xvbitsel_v(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvbitseli_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32u8 xvbitseli_b(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvbitseli_b(_1, _2, 1); }
// CHECK-LABEL: define dso_local void @xvshuf4i_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvshuf4i_b(v32i8 _1) { return __builtin_lasx_xvshuf4i_b(_1, 1); }
// CHECK-LABEL: define dso_local void @xvshuf4i_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> [[_1]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvshuf4i_h(v16i16 _1) { return __builtin_lasx_xvshuf4i_h(_1, 1); }
// CHECK-LABEL: define dso_local void @xvshuf4i_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvshuf4i_w(v8i32 _1) { return __builtin_lasx_xvshuf4i_w(_1, 1); }
@@ -2837,7 +2837,7 @@ v8i32 xvshuf4i_w(v8i32 _1) { return __builtin_lasx_xvshuf4i_w(_1, 1); }
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32 [[_1]])
-// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvreplgr2vr_b(int _1) { return __builtin_lasx_xvreplgr2vr_b(_1); }
@@ -2845,7 +2845,7 @@ v32i8 xvreplgr2vr_b(int _1) { return __builtin_lasx_xvreplgr2vr_b(_1); }
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplgr2vr.h(i32 [[_1]])
-// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvreplgr2vr_h(int _1) { return __builtin_lasx_xvreplgr2vr_h(_1); }
@@ -2853,7 +2853,7 @@ v16i16 xvreplgr2vr_h(int _1) { return __builtin_lasx_xvreplgr2vr_h(_1); }
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i32 noundef signext [[_1:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvreplgr2vr_w(int _1) { return __builtin_lasx_xvreplgr2vr_w(_1); }
@@ -2862,1641 +2862,1641 @@ v8i32 xvreplgr2vr_w(int _1) { return __builtin_lasx_xvreplgr2vr_w(_1); }
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[_1]] to i64
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 [[CONV]])
-// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvreplgr2vr_d(int _1) { return __builtin_lasx_xvreplgr2vr_d(_1); }
// CHECK-LABEL: define dso_local void @xvpcnt_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> [[_1]])
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvpcnt_b(v32i8 _1) { return __builtin_lasx_xvpcnt_b(_1); }
// CHECK-LABEL: define dso_local void @xvpcnt_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> [[_1]])
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvpcnt_h(v16i16 _1) { return __builtin_lasx_xvpcnt_h(_1); }
// CHECK-LABEL: define dso_local void @xvpcnt_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvpcnt_w(v8i32 _1) { return __builtin_lasx_xvpcnt_w(_1); }
// CHECK-LABEL: define dso_local void @xvpcnt_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvpcnt_d(v4i64 _1) { return __builtin_lasx_xvpcnt_d(_1); }
// CHECK-LABEL: define dso_local void @xvclo_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> [[_1]])
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvclo_b(v32i8 _1) { return __builtin_lasx_xvclo_b(_1); }
// CHECK-LABEL: define dso_local void @xvclo_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> [[_1]])
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvclo_h(v16i16 _1) { return __builtin_lasx_xvclo_h(_1); }
// CHECK-LABEL: define dso_local void @xvclo_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvclo_w(v8i32 _1) { return __builtin_lasx_xvclo_w(_1); }
// CHECK-LABEL: define dso_local void @xvclo_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvclo_d(v4i64 _1) { return __builtin_lasx_xvclo_d(_1); }
// CHECK-LABEL: define dso_local void @xvclz_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> [[_1]])
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvclz_b(v32i8 _1) { return __builtin_lasx_xvclz_b(_1); }
// CHECK-LABEL: define dso_local void @xvclz_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> [[_1]])
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvclz_h(v16i16 _1) { return __builtin_lasx_xvclz_h(_1); }
// CHECK-LABEL: define dso_local void @xvclz_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvclz_w(v8i32 _1) { return __builtin_lasx_xvclz_w(_1); }
// CHECK-LABEL: define dso_local void @xvclz_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvclz_d(v4i64 _1) { return __builtin_lasx_xvclz_d(_1); }
// CHECK-LABEL: define dso_local void @xvfadd_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8f32 xvfadd_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfadd_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfadd_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4f64 xvfadd_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfadd_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfsub_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8f32 xvfsub_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfsub_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfsub_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4f64 xvfsub_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfsub_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfmul_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8f32 xvfmul_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmul_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfmul_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4f64 xvfmul_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmul_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfdiv_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8f32 xvfdiv_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfdiv_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfdiv_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4f64 xvfdiv_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfdiv_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcvt_h_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvfcvt_h_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcvt_h_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcvt_s_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8f32 xvfcvt_s_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcvt_s_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfmin_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8f32 xvfmin_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmin_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfmin_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4f64 xvfmin_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmin_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfmina_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8f32 xvfmina_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmina_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfmina_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4f64 xvfmina_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmina_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfmax_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8f32 xvfmax_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmax_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfmax_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none)
[[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfmax_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmax_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmaxa_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfmaxa_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfmaxa_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvfmaxa_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfmaxa_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfmaxa_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfclass_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) 
local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvfclass_s(v8f32 _1) { return __builtin_lasx_xvfclass_s(_1); } // CHECK-LABEL: define dso_local void @xvfclass_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvfclass_d(v4f64 _1) { return __builtin_lasx_xvfclass_d(_1); } // CHECK-LABEL: define dso_local void @xvfsqrt_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfsqrt.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfsqrt_s(v8f32 _1) { return __builtin_lasx_xvfsqrt_s(_1); } // CHECK-LABEL: define dso_local void @xvfsqrt_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfsqrt_d(v4f64 _1) { return __builtin_lasx_xvfsqrt_d(_1); } // CHECK-LABEL: define dso_local void @xvfrecip_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) 
[[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfrecip_s(v8f32 _1) { return __builtin_lasx_xvfrecip_s(_1); } // CHECK-LABEL: define dso_local void @xvfrecip_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfrecip_d(v4f64 _1) { return __builtin_lasx_xvfrecip_d(_1); } // CHECK-LABEL: define dso_local void @xvfrint_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfrint_s(v8f32 _1) { return __builtin_lasx_xvfrint_s(_1); } // CHECK-LABEL: define dso_local void @xvfrint_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfrint_d(v4f64 _1) { return __builtin_lasx_xvfrint_d(_1); } // CHECK-LABEL: define dso_local void @xvfrsqrt_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return 
noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfrsqrt_s(v8f32 _1) { return __builtin_lasx_xvfrsqrt_s(_1); } // CHECK-LABEL: define dso_local void @xvfrsqrt_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfrsqrt_d(v4f64 _1) { return __builtin_lasx_xvfrsqrt_d(_1); } // CHECK-LABEL: define dso_local void @xvflogb_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvflogb_s(v8f32 _1) { return __builtin_lasx_xvflogb_s(_1); } // CHECK-LABEL: define dso_local void @xvflogb_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvflogb_d(v4f64 _1) { return __builtin_lasx_xvflogb_d(_1); } // CHECK-LABEL: define dso_local void @xvfcvth_s_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfcvth_s_h(v16i16 _1) { return __builtin_lasx_xvfcvth_s_h(_1); } // CHECK-LABEL: define dso_local void @xvfcvth_d_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfcvth_d_s(v8f32 _1) { return __builtin_lasx_xvfcvth_d_s(_1); } // CHECK-LABEL: define dso_local void @xvfcvtl_s_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvfcvtl_s_h(v16i16 _1) { return __builtin_lasx_xvfcvtl_s_h(_1); } // CHECK-LABEL: define dso_local void @xvfcvtl_d_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvfcvtl_d_s(v8f32 _1) { return __builtin_lasx_xvfcvtl_d_s(_1); } // CHECK-LABEL: define dso_local void @xvftint_w_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) 
align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvftint_w_s(v8f32 _1) { return __builtin_lasx_xvftint_w_s(_1); } // CHECK-LABEL: define dso_local void @xvftint_l_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvftint_l_d(v4f64 _1) { return __builtin_lasx_xvftint_l_d(_1); } // CHECK-LABEL: define dso_local void @xvftint_wu_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvftint_wu_s(v8f32 _1) { return __builtin_lasx_xvftint_wu_s(_1); } // CHECK-LABEL: define dso_local void @xvftint_lu_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvftint_lu_d(v4f64 _1) { return __builtin_lasx_xvftint_lu_d(_1); } // CHECK-LABEL: define dso_local void @xvftintrz_w_s( // CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvftintrz_w_s(v8f32 _1) { return __builtin_lasx_xvftintrz_w_s(_1); } // CHECK-LABEL: define dso_local void @xvftintrz_l_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvftintrz_l_d(v4f64 _1) { return __builtin_lasx_xvftintrz_l_d(_1); } // CHECK-LABEL: define dso_local void @xvftintrz_wu_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvftintrz_wu_s(v8f32 _1) { return __builtin_lasx_xvftintrz_wu_s(_1); } // CHECK-LABEL: define dso_local void @xvftintrz_lu_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvftintrz_lu_d(v4f64 _1) { return __builtin_lasx_xvftintrz_lu_d(_1); } // CHECK-LABEL: define dso_local void 
@xvffint_s_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvffint_s_w(v8i32 _1) { return __builtin_lasx_xvffint_s_w(_1); } // CHECK-LABEL: define dso_local void @xvffint_d_l( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvffint_d_l(v4i64 _1) { return __builtin_lasx_xvffint_d_l(_1); } // CHECK-LABEL: define dso_local void @xvffint_s_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> [[_1]]) -// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8f32 xvffint_s_wu(v8u32 _1) { return __builtin_lasx_xvffint_s_wu(_1); } // CHECK-LABEL: define dso_local void @xvffint_d_lu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4f64 xvffint_d_lu(v4u64 _1) { return __builtin_lasx_xvffint_d_lu(_1); } // 
CHECK-LABEL: define dso_local void @xvreplve_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> [[_1]], i32 [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvreplve_b(v32i8 _1, int _2) { return __builtin_lasx_xvreplve_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvreplve_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> [[_1]], i32 [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvreplve_h(v16i16 _1, int _2) { return __builtin_lasx_xvreplve_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvreplve_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> [[_1]], i32 [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvreplve_w(v8i32 _1, int _2) { return __builtin_lasx_xvreplve_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvreplve_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i32 noundef signext [[_2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> [[_1]], i32 [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] 
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvreplve_d(v4i64 _1, int _2) { return __builtin_lasx_xvreplve_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvpermi_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvpermi_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvpermi_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvandn_v( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvandn_v(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvandn_v(_1, _2); } // CHECK-LABEL: define dso_local void @xvneg_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvneg_b(v32i8 _1) { return __builtin_lasx_xvneg_b(_1); } // CHECK-LABEL: define dso_local void @xvneg_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvneg_h(v16i16 _1) { return __builtin_lasx_xvneg_h(_1); } // CHECK-LABEL: define dso_local void @xvneg_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvneg_w(v8i32 _1) { return __builtin_lasx_xvneg_w(_1); } // CHECK-LABEL: define dso_local void @xvneg_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvneg_d(v4i64 _1) { return __builtin_lasx_xvneg_d(_1); } // CHECK-LABEL: define dso_local void @xvmuh_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmuh_b(v32i8 _1, v32i8 _2) { 
return __builtin_lasx_xvmuh_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvmuh_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmuh_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmuh_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvmuh_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmuh_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmuh_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvmuh_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmuh_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmuh_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvmuh_bu( // 
CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvmuh_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmuh_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmuh_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvmuh_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmuh_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmuh_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvmuh_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmuh_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmuh_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvmuh_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmuh_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvsllwil_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsllwil_h_b(v32i8 _1) { return __builtin_lasx_xvsllwil_h_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvsllwil_w_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsllwil_w_h(v16i16 _1) { return __builtin_lasx_xvsllwil_w_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvsllwil_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x 
i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsllwil_d_w(v8i32 _1) { return __builtin_lasx_xvsllwil_d_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvsllwil_hu_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvsllwil_hu_bu(v32u8 _1) { return __builtin_lasx_xvsllwil_hu_bu(_1, 1); } // CHECK-LABEL: define dso_local void @xvsllwil_wu_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvsllwil_wu_hu(v16u16 _1) { return __builtin_lasx_xvsllwil_wu_hu(_1, 1); } // CHECK-LABEL: define dso_local void @xvsllwil_du_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvsllwil_du_wu(v8u32 _1) { return __builtin_lasx_xvsllwil_du_wu(_1, 1); } // CHECK-LABEL: define dso_local void @xvsran_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsran_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsran_b_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsran_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsran_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsran_h_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsran_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsran_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsran_w_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvssran_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr 
[[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvssran_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssran_b_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvssran_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvssran_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssran_h_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvssran_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvssran_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssran_w_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvssran_bu_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // 
CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvssran_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssran_bu_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvssran_hu_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvssran_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssran_hu_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvssran_wu_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvssran_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssran_wu_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrarn_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x 
i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrarn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrarn_b_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrarn_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrarn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrarn_h_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrarn_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrarn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrarn_w_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrarn_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> 
[[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvssrarn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrarn_b_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrarn_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvssrarn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrarn_h_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrarn_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvssrarn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrarn_w_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrarn_bu_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// 
CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvssrarn_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssrarn_bu_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrarn_hu_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvssrarn_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssrarn_hu_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrarn_wu_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvssrarn_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssrarn_wu_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrln_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], 
ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrln_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrln_b_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrln_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrln_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrln_h_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrln_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrln_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrln_w_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrln_bu_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvssrln_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssrln_bu_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrln_hu_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvssrln_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssrln_hu_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrln_wu_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvssrln_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssrln_wu_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrlrn_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrlrn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlrn_b_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrlrn_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrlrn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlrn_h_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsrlrn_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrlrn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlrn_w_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrlrn_bu_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // 
CHECK-NEXT: ret void // v32u8 xvssrlrn_bu_h(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvssrlrn_bu_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrlrn_hu_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvssrlrn_hu_w(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvssrlrn_hu_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvssrlrn_wu_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvssrlrn_wu_d(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvssrlrn_wu_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfrstpi_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 
xvfrstpi_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvfrstpi_b(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvfrstpi_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvfrstpi_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvfrstpi_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvfrstp_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvfrstp_b(v32i8 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvfrstp_b(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvfrstp_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr 
[[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvfrstp_h(v16i16 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvfrstp_h(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvshuf4i_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvshuf4i_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvshuf4i_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvbsrl_v( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvbsrl_v(v32i8 _1) { return __builtin_lasx_xvbsrl_v(_1, 1); } // CHECK-LABEL: define dso_local void @xvbsll_v( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvbsll_v(v32i8 _1) { return __builtin_lasx_xvbsll_v(_1, 1); } // CHECK-LABEL: define dso_local 
void @xvextrins_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvextrins_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvextrins_b(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvextrins_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvextrins_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvextrins_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvextrins_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvextrins_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvextrins_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvextrins_d( // 
CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvextrins_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvextrins_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvmskltz_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmskltz_b(v32i8 _1) { return __builtin_lasx_xvmskltz_b(_1); } // CHECK-LABEL: define dso_local void @xvmskltz_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmskltz_h(v16i16 _1) { return __builtin_lasx_xvmskltz_h(_1); } // CHECK-LABEL: define dso_local void @xvmskltz_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 
32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmskltz_w(v8i32 _1) { return __builtin_lasx_xvmskltz_w(_1); } // CHECK-LABEL: define dso_local void @xvmskltz_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmskltz_d(v4i64 _1) { return __builtin_lasx_xvmskltz_d(_1); } // CHECK-LABEL: define dso_local void @xvsigncov_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsigncov_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsigncov_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvsigncov_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsigncov_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsigncov_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsigncov_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x 
i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvsigncov_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsigncov_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvsigncov_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvsigncov_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsigncov_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfmadd_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]])
-// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvfmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfmadd_s(_1, _2, _3); }
 // CHECK-LABEL: define dso_local void @xvfmadd_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]])
-// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvfmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfmadd_d(_1, _2, _3); }
 // CHECK-LABEL: define dso_local void @xvfmsub_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]])
-// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvfmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfmsub_s(_1, _2, _3); }
 // CHECK-LABEL: define dso_local void @xvfmsub_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]])
-// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvfmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfmsub_d(_1, _2, _3); }
 // CHECK-LABEL: define dso_local void @xvfnmadd_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]])
-// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvfnmadd_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfnmadd_s(_1, _2, _3); }
 // CHECK-LABEL: define dso_local void @xvfnmadd_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]])
-// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvfnmadd_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfnmadd_d(_1, _2, _3); }
 // CHECK-LABEL: define dso_local void @xvfnmsub_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x float>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> [[_1]], <8 x float> [[_2]], <8 x float> [[_3]])
-// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvfnmsub_s(v8f32 _1, v8f32 _2, v8f32 _3) { return __builtin_lasx_xvfnmsub_s(_1, _2, _3); }
 // CHECK-LABEL: define dso_local void @xvfnmsub_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x double>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> [[_1]], <4 x double> [[_2]], <4 x double> [[_3]])
-// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvfnmsub_d(v4f64 _1, v4f64 _2, v4f64 _3) { return __builtin_lasx_xvfnmsub_d(_1, _2, _3); }
 // CHECK-LABEL: define dso_local void @xvftintrne_w_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrne_w_s(v8f32 _1) { return __builtin_lasx_xvftintrne_w_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrne_l_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrne_l_d(v4f64 _1) { return __builtin_lasx_xvftintrne_l_d(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrp_w_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrp_w_s(v8f32 _1) { return __builtin_lasx_xvftintrp_w_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrp_l_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrp_l_d(v4f64 _1) { return __builtin_lasx_xvftintrp_l_d(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrm_w_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrm_w_s(v8f32 _1) { return __builtin_lasx_xvftintrm_w_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrm_l_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrm_l_d(v4f64 _1) { return __builtin_lasx_xvftintrm_l_d(_1); }
 // CHECK-LABEL: define dso_local void @xvftint_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftint_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftint_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvffint_s_l(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8f32 xvffint_s_l(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvffint_s_l(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvftintrz_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrz_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrz_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvftintrp_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrp_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrp_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvftintrm_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrm_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrm_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvftintrne_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvftintrne_w_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvftintrne_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvftinth_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftinth_l_s(v8f32 _1) { return __builtin_lasx_xvftinth_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintl_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintl_l_s(v8f32 _1) { return __builtin_lasx_xvftintl_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvffinth_d_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvffinth_d_w(v8i32 _1) { return __builtin_lasx_xvffinth_d_w(_1); }
 // CHECK-LABEL: define dso_local void @xvffintl_d_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4f64 xvffintl_d_w(v8i32 _1) { return __builtin_lasx_xvffintl_d_w(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrzh_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrzh_l_s(v8f32 _1) { return __builtin_lasx_xvftintrzh_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrzl_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrzl_l_s(v8f32 _1) { return __builtin_lasx_xvftintrzl_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrph_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrph_l_s(v8f32 _1) { return __builtin_lasx_xvftintrph_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrpl_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrpl_l_s(v8f32 _1) { return __builtin_lasx_xvftintrpl_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrmh_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrmh_l_s(v8f32 _1) { return __builtin_lasx_xvftintrmh_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrml_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrml_l_s(v8f32 _1) { return __builtin_lasx_xvftintrml_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrneh_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrneh_l_s(v8f32 _1) { return __builtin_lasx_xvftintrneh_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvftintrnel_l_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvftintrnel_l_s(v8f32 _1) { return __builtin_lasx_xvftintrnel_l_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrne_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfrintrne_s(v8f32 _1) { return __builtin_lasx_xvfrintrne_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrne_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfrintrne_d(v4f64 _1) { return __builtin_lasx_xvfrintrne_d(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrz_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfrintrz_s(v8f32 _1) { return __builtin_lasx_xvfrintrz_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrz_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfrintrz_d(v4f64 _1) { return __builtin_lasx_xvfrintrz_d(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrp_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfrintrp_s(v8f32 _1) { return __builtin_lasx_xvfrintrp_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrp_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfrintrp_d(v4f64 _1) { return __builtin_lasx_xvfrintrp_d(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrm_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> [[_1]])
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfrintrm_s(v8f32 _1) { return __builtin_lasx_xvfrintrm_s(_1); }
 // CHECK-LABEL: define dso_local void @xvfrintrm_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> [[_1]])
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfrintrm_d(v4f64 _1) { return __builtin_lasx_xvfrintrm_d(_1); }
@@ -4504,14 +4504,14 @@ v4i64 xvfrintrm_d(v4f64 _1) { return __builtin_lasx_xvfrintrm_d(_1); }
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvld(ptr [[_1]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvld(void *_1) { return __builtin_lasx_xvld(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvst(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvst(<32 x i8> [[_1]], ptr [[_2]], i32 1)
 // CHECK-NEXT: ret void
 //
@@ -4519,7 +4519,7 @@ void xvst(v32i8 _1, void *_2) { return __builtin_lasx_xvst(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvstelm_b(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> [[_1]], ptr [[_2]], i32 1, i32 1)
 // CHECK-NEXT: ret void
 //
@@ -4527,7 +4527,7 @@ void xvstelm_b(v32i8 _1, void * _2) { return __builtin_lasx_xvstelm_b(_1, _2, 1,
 // CHECK-LABEL: define dso_local void @xvstelm_h(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> [[_1]], ptr [[_2]], i32 2, i32 1)
 // CHECK-NEXT: ret void
 //
@@ -4535,7 +4535,7 @@ void xvstelm_h(v16i16 _1, void * _2) { return __builtin_lasx_xvstelm_h(_1, _2, 2
 // CHECK-LABEL: define dso_local void @xvstelm_w(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> [[_1]], ptr [[_2]], i32 4, i32 1)
 // CHECK-NEXT: ret void
 //
@@ -4543,7 +4543,7 @@ void xvstelm_w(v8i32 _1, void * _2) { return __builtin_lasx_xvstelm_w(_1, _2, 4,
 // CHECK-LABEL: define dso_local void @xvstelm_d(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> [[_1]], ptr [[_2]], i32 8, i32 1)
 // CHECK-NEXT: ret void
 //
@@ -4551,108 +4551,108 @@ void xvstelm_d(v4i64 _1, void * _2) { return __builtin_lasx_xvstelm_d(_1, _2, 8,
 // CHECK-LABEL: define dso_local void @xvinsve0_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvinsve0_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvinsve0_w(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvinsve0_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvinsve0_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvinsve0_d(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvpickve_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvpickve_w(v8i32 _1) { return __builtin_lasx_xvpickve_w(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvpickve_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvpickve_d(v4i64 _1) { return __builtin_lasx_xvpickve_d(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvssrlrn_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvssrlrn_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrlrn_b_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrlrn_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvssrlrn_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrlrn_h_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrlrn_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvssrlrn_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrlrn_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrln_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvssrln_b_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrln_b_h(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrln_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvssrln_h_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrln_h_w(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvssrln_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvssrln_w_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrln_w_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvorn_v(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvorn_v(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvorn_v(_1, _2); }
@@ -4660,7 +4660,7 @@ v32i8 xvorn_v(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvorn_v(_1, _2); }
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldi(i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvldi() { return __builtin_lasx_xvldi(1); }
@@ -4668,14 +4668,14 @@ v4i64 xvldi() { return __builtin_lasx_xvldi(1); }
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldx(ptr [[_1]], i64 1)
-// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvldx(void *_1) { return __builtin_lasx_xvldx(_1, 1); }
 // CHECK-LABEL: define dso_local void @xvstx(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr noundef [[_2:%.*]]) local_unnamed_addr #[[ATTR5]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: tail call void @llvm.loongarch.lasx.xvstx(<32 x i8> [[_1]], ptr [[_2]], i64 1)
 // CHECK-NEXT: ret void
 //
@@ -4683,209 +4683,209 @@ void xvstx(v32i8 _1, void *_2) { return __builtin_lasx_xvstx(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvextl_qu_du(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4u64 xvextl_qu_du(v4u64 _1) { return __builtin_lasx_xvextl_qu_du(_1); }
 // CHECK-LABEL: define dso_local void @xvinsgr2vr_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> [[_1]], i32 1, i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvinsgr2vr_w(v8i32 _1) { return __builtin_lasx_xvinsgr2vr_w(_1, 1, 1); }
 // CHECK-LABEL: define dso_local void @xvinsgr2vr_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> [[_1]], i64 1, i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvinsgr2vr_d(v4i64 _1) { return __builtin_lasx_xvinsgr2vr_d(_1, 1, 1); }
 // CHECK-LABEL: define dso_local void @xvreplve0_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> [[_1]])
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvreplve0_b(v32i8 _1) { return __builtin_lasx_xvreplve0_b(_1); }
 // CHECK-LABEL: define dso_local void @xvreplve0_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16> [[_1]])
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvreplve0_h(v16i16 _1) { return __builtin_lasx_xvreplve0_h(_1); }
 // CHECK-LABEL: define dso_local void @xvreplve0_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvreplve0_w(v8i32 _1) { return __builtin_lasx_xvreplve0_w(_1); }
 // CHECK-LABEL: define dso_local void @xvreplve0_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvreplve0_d(v4i64 _1) { return __builtin_lasx_xvreplve0_d(_1); }
 // CHECK-LABEL: define dso_local void @xvreplve0_q(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> [[_1]])
-// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvreplve0_q(v32i8 _1) { return __builtin_lasx_xvreplve0_q(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_h_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> [[_1]])
-// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 vext2xv_h_b(v32i8 _1) { return __builtin_lasx_vext2xv_h_b(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_w_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 vext2xv_w_h(v16i16 _1) { return __builtin_lasx_vext2xv_w_h(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_d_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 vext2xv_d_w(v8i32 _1) { return __builtin_lasx_vext2xv_d_w(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_w_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> [[_1]])
-// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 vext2xv_w_b(v32i8 _1) { return __builtin_lasx_vext2xv_w_b(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_d_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> [[_1]])
-// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 vext2xv_d_h(v16i16 _1) { return __builtin_lasx_vext2xv_d_h(_1); }
 // CHECK-LABEL: define dso_local void @vext2xv_d_b(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x
i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 vext2xv_d_b(v32i8 _1) { return __builtin_lasx_vext2xv_d_b(_1); } // CHECK-LABEL: define dso_local void @vext2xv_hu_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 vext2xv_hu_bu(v32i8 _1) { return __builtin_lasx_vext2xv_hu_bu(_1); } // CHECK-LABEL: define dso_local void @vext2xv_wu_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 vext2xv_wu_hu(v16i16 _1) { return __builtin_lasx_vext2xv_wu_hu(_1); } // CHECK-LABEL: define dso_local void @vext2xv_du_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 vext2xv_du_wu(v8i32 _1) { return __builtin_lasx_vext2xv_du_wu(_1); } // CHECK-LABEL: define dso_local void @vext2xv_wu_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: 
[[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 vext2xv_wu_bu(v32i8 _1) { return __builtin_lasx_vext2xv_wu_bu(_1); } // CHECK-LABEL: define dso_local void @vext2xv_du_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 vext2xv_du_hu(v16i16 _1) { return __builtin_lasx_vext2xv_du_hu(_1); } // CHECK-LABEL: define dso_local void @vext2xv_du_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 vext2xv_du_bu(v32i8 _1) { return __builtin_lasx_vext2xv_du_bu(_1); } // CHECK-LABEL: define dso_local void @xvpermi_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvpermi_q(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvpermi_q(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvpermi_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 
captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvpermi_d(v4i64 _1) { return __builtin_lasx_xvpermi_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvperm_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvperm_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvperm_w(_1, _2); } @@ -4893,7 +4893,7 @@ v8i32 xvperm_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvperm_w(_1, _2); } // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(ptr [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvldrepl_b(void *_1) { return __builtin_lasx_xvldrepl_b(_1, 1); } @@ -4901,7 +4901,7 @@ v32i8 xvldrepl_b(void *_1) { return __builtin_lasx_xvldrepl_b(_1, 1); } // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(ptr [[_1]], i32 2) -// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvldrepl_h(void *_1) { return __builtin_lasx_xvldrepl_h(_1, 2); } @@ -4909,7 +4909,7 @@ v16i16 xvldrepl_h(void *_1) { return __builtin_lasx_xvldrepl_h(_1, 2); } // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(ptr [[_1]], i32 4) -// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvldrepl_w(void *_1) { return __builtin_lasx_xvldrepl_w(_1, 4); } @@ -4917,14 +4917,14 @@ v8i32 xvldrepl_w(void *_1) { return __builtin_lasx_xvldrepl_w(_1, 4); } // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef [[_1:%.*]]) local_unnamed_addr #[[ATTR3]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(ptr [[_1]], i32 8) -// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvldrepl_d(void *_1) { return __builtin_lasx_xvldrepl_d(_1, 8); } // CHECK-LABEL: define dso_local signext i32 @xvpickve2gr_w( // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> [[_1]], i32 1) // CHECK-NEXT: ret i32 [[TMP1]] // @@ -4932,7 +4932,7 @@ int xvpickve2gr_w(v8i32 _1) { return __builtin_lasx_xvpickve2gr_w(_1, 1); } // CHECK-LABEL: define dso_local signext i32 @xvpickve2gr_wu( // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> [[_1]], i32 1) // CHECK-NEXT: ret i32 [[TMP1]] // @@ -4940,7 +4940,7 @@ unsigned int xvpickve2gr_wu(v8i32 _1) { return __builtin_lasx_xvpickve2gr_wu(_1, // CHECK-LABEL: define dso_local i64 @xvpickve2gr_d( // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> [[_1]], i32 1) // CHECK-NEXT: ret i64 [[TMP1]] // @@ -4948,7 +4948,7 @@ long xvpickve2gr_d(v4i64 _1) { return __builtin_lasx_xvpickve2gr_d(_1, 1); } // CHECK-LABEL: define dso_local i64 @xvpickve2gr_du( // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> [[_1]], i32 1) // 
CHECK-NEXT: ret i64 [[TMP1]] // @@ -4956,1626 +4956,1626 @@ unsigned long int xvpickve2gr_du(v4i64 _1) { return __builtin_lasx_xvpickve2gr_d // CHECK-LABEL: define dso_local void @xvaddwev_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwev_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvaddwev_q_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwev_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwev_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvaddwev_d_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwev_w_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void 
// v8i32 xvaddwev_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvaddwev_w_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwev_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvaddwev_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvaddwev_h_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwev_q_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwev_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvaddwev_q_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwev_d_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwev_d_wu(v8u32 _1, v8u32 _2) 
{ return __builtin_lasx_xvaddwev_d_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwev_w_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvaddwev_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvaddwev_w_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwev_h_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvaddwev_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvaddwev_h_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubwev_q_d(v4i64 _1, v4i64 _2) { return 
__builtin_lasx_xvsubwev_q_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubwev_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsubwev_d_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_w_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsubwev_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsubwev_w_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsubwev_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsubwev_h_b(_1, _2); } // 
CHECK-LABEL: define dso_local void @xvsubwev_q_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubwev_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsubwev_q_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_d_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsubwev_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsubwev_d_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvsubwev_w_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsubwev_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsubwev_w_hu(_1, _2); } // CHECK-LABEL: define dso_local void 
@xvsubwev_h_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsubwev_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsubwev_h_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwev_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmulwev_q_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwev_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmulwev_d_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_w_h( // CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmulwev_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmulwev_w_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmulwev_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmulwev_h_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_q_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwev_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmulwev_q_du(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_d_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwev_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmulwev_d_wu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_w_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmulwev_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmulwev_w_hu(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_h_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvmulwev_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmulwev_h_bu(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 
32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwod_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvaddwod_q_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwod_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvaddwod_d_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_w_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvaddwod_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvaddwod_w_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) 
[[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvaddwod_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvaddwod_h_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwod_q_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvaddwod_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvaddwod_q_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwod_d_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvaddwod_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvaddwod_d_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwod_w_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvaddwod_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvaddwod_w_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwod_h_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvaddwod_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvaddwod_h_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsubwod_q_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsubwod_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsubwod_q_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsubwod_d_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsubwod_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsubwod_d_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsubwod_w_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsubwod_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsubwod_w_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsubwod_h_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsubwod_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsubwod_h_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsubwod_q_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsubwod_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvsubwod_q_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsubwod_d_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvsubwod_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvsubwod_d_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsubwod_w_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvsubwod_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvsubwod_w_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvsubwod_h_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvsubwod_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvsubwod_h_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_q_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmulwod_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvmulwod_q_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_d_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmulwod_d_w(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvmulwod_d_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_w_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmulwod_w_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvmulwod_w_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_h_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmulwod_h_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvmulwod_h_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_q_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmulwod_q_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvmulwod_q_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_d_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmulwod_d_wu(v8u32 _1, v8u32 _2) { return __builtin_lasx_xvmulwod_d_wu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_w_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmulwod_w_hu(v16u16 _1, v16u16 _2) { return __builtin_lasx_xvmulwod_w_hu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_h_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmulwod_h_bu(v32u8 _1, v32u8 _2) { return __builtin_lasx_xvmulwod_h_bu(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwev_d_wu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvaddwev_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvaddwev_d_wu_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwev_w_hu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvaddwev_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvaddwev_w_hu_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwev_h_bu_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvaddwev_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvaddwev_h_bu_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwev_d_wu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmulwev_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvmulwev_d_wu_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwev_w_hu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmulwev_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvmulwev_w_hu_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwev_h_bu_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmulwev_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvmulwev_h_bu_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwod_d_wu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvaddwod_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvaddwod_d_wu_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwod_w_hu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvaddwod_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvaddwod_w_hu_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvaddwod_h_bu_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvaddwod_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvaddwod_h_bu_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_d_wu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> [[_1]], <8 x i32> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmulwod_d_wu_w(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvmulwod_d_wu_w(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_w_hu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> [[_1]], <16 x i16> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmulwod_w_hu_h(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvmulwod_w_hu_h(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmulwod_h_bu_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> [[_1]], <32 x i8> [[_2]])
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmulwod_h_bu_b(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvmulwod_h_bu_b(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhaddw_q_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvhaddw_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvhaddw_q_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhaddw_qu_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvhaddw_qu_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvhaddw_qu_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhsubw_q_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvhsubw_q_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvhsubw_q_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvhsubw_qu_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> [[_1]], <4 x i64> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvhsubw_qu_du(v4u64 _1, v4u64 _2) { return __builtin_lasx_xvhsubw_qu_du(_1, _2); }
// CHECK-LABEL: define dso_local void @xvmaddwev_q_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwev_q_d(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_d_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwev_d_w(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_w_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmaddwev_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwev_w_h(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_h_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmaddwev_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwev_h_b(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_q_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmaddwev_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __builtin_lasx_xvmaddwev_q_du(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_d_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmaddwev_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __builtin_lasx_xvmaddwev_d_wu(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_w_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvmaddwev_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __builtin_lasx_xvmaddwev_w_hu(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_h_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvmaddwev_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __builtin_lasx_xvmaddwev_h_bu(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_q_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_q_d(v4i64 _1, v4i64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwod_q_d(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_d_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_d_w(v4i64 _1, v8i32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwod_d_w(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_w_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmaddwod_w_h(v8i32 _1, v16i16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwod_w_h(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_h_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmaddwod_h_b(v16i16 _1, v32i8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwod_h_b(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_q_du(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmaddwod_q_du(v4u64 _1, v4u64 _2, v4u64 _3) { return __builtin_lasx_xvmaddwod_q_du(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_d_wu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4u64 xvmaddwod_d_wu(v4u64 _1, v8u32 _2, v8u32 _3) { return __builtin_lasx_xvmaddwod_d_wu(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_w_hu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8u32 xvmaddwod_w_hu(v8u32 _1, v16u16 _2, v16u16 _3) { return __builtin_lasx_xvmaddwod_w_hu(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_h_bu(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16u16 xvmaddwod_h_bu(v16u16 _1, v32u8 _2, v32u8 _3) { return __builtin_lasx_xvmaddwod_h_bu(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_q_du_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwev_q_du_d(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_d_wu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwev_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwev_d_wu_w(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_w_hu_h(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]])
-// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvmaddwev_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwev_w_hu_h(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwev_h_bu_b(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]])
-// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvmaddwev_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwev_h_bu_b(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_q_du_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_3:%.*]] = load <4 x i64>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]], <4 x i64> [[_3]])
-// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvmaddwod_q_du_d(v4i64 _1, v4u64 _2, v4i64 _3) { return __builtin_lasx_xvmaddwod_q_du_d(_1, _2, _3); }
// CHECK-LABEL: define dso_local void @xvmaddwod_d_wu_w(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> [[_1]], <8 x i32> [[_2]], <8 x i32> [[_3]]) -// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmaddwod_d_wu_w(v4i64 _1, v8u32 _2, v8i32 _3) { return __builtin_lasx_xvmaddwod_d_wu_w(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvmaddwod_w_hu_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <16 x i16>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> [[_1]], <16 x i16> [[_2]], <16 x i16> [[_3]]) -// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvmaddwod_w_hu_h(v8i32 _1, v16u16 _2, v16i16 _3) { return __builtin_lasx_xvmaddwod_w_hu_h(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvmaddwod_h_bu_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP2:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_3:%.*]] = load <32 x i8>, ptr [[TMP2]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP3:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> [[_1]], <32 x i8> [[_2]], <32 x i8> [[_3]]) -// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void 
// v16i16 xvmaddwod_h_bu_b(v16i16 _1, v32u8 _2, v32i8 _3) { return __builtin_lasx_xvmaddwod_h_bu_b(_1, _2, _3); } // CHECK-LABEL: define dso_local void @xvrotr_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> [[_1]], <32 x i8> [[_2]]) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvrotr_b(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvrotr_b(_1, _2); } // CHECK-LABEL: define dso_local void @xvrotr_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> [[_1]], <16 x i16> [[_2]]) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvrotr_h(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvrotr_h(_1, _2); } // CHECK-LABEL: define dso_local void @xvrotr_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> [[_1]], <8 x i32> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvrotr_w(v8i32 _1, v8i32 _2) { return 
__builtin_lasx_xvrotr_w(_1, _2); } // CHECK-LABEL: define dso_local void @xvrotr_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvrotr_d(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvrotr_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvadd_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvadd_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvadd_q(_1, _2); } // CHECK-LABEL: define dso_local void @xvsub_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsub_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsub_q(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwev_q_du_d( // CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwev_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvaddwev_q_du_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvaddwod_q_du_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvaddwod_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvaddwod_q_du_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwev_q_du_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwev_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvmulwev_q_du_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvmulwod_q_du_d( // CHECK-SAME: ptr dead_on_unwind 
noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> [[_1]], <4 x i64> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvmulwod_q_du_d(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvmulwod_q_du_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvmskgez_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmskgez_b(v32i8 _1) { return __builtin_lasx_xvmskgez_b(_1); } // CHECK-LABEL: define dso_local void @xvmsknz_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvmsknz_b(v32i8 _1) { return __builtin_lasx_xvmsknz_b(_1); } // CHECK-LABEL: define dso_local void @xvexth_h_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvexth_h_b(v32i8 _1) { return __builtin_lasx_xvexth_h_b(_1); } // CHECK-LABEL: define dso_local void @xvexth_w_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvexth_w_h(v16i16 _1) { return __builtin_lasx_xvexth_w_h(_1); } // CHECK-LABEL: define dso_local void @xvexth_d_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvexth_d_w(v8i32 _1) { return __builtin_lasx_xvexth_d_w(_1); } // CHECK-LABEL: define dso_local void @xvexth_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvexth_q_d(v4i64 _1) { return __builtin_lasx_xvexth_q_d(_1); } // CHECK-LABEL: define dso_local void @xvexth_hu_bu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> [[_1]]) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store 
<16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvexth_hu_bu(v32u8 _1) { return __builtin_lasx_xvexth_hu_bu(_1); } // CHECK-LABEL: define dso_local void @xvexth_wu_hu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> [[_1]]) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8u32 xvexth_wu_hu(v16u16 _1) { return __builtin_lasx_xvexth_wu_hu(_1); } // CHECK-LABEL: define dso_local void @xvexth_du_wu( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvexth_du_wu(v8u32 _1) { return __builtin_lasx_xvexth_du_wu(_1); } // CHECK-LABEL: define dso_local void @xvexth_qu_du( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4u64 xvexth_qu_du(v4u64 _1) { return __builtin_lasx_xvexth_qu_du(_1); } // CHECK-LABEL: define dso_local void @xvrotri_b( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> [[_1]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvrotri_b(v32i8 _1) { return __builtin_lasx_xvrotri_b(_1, 1); } // CHECK-LABEL: define dso_local void @xvrotri_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> [[_1]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvrotri_h(v16i16 _1) { return __builtin_lasx_xvrotri_h(_1, 1); } // CHECK-LABEL: define dso_local void @xvrotri_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> [[_1]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvrotri_w(v8i32 _1) { return __builtin_lasx_xvrotri_w(_1, 1); } // CHECK-LABEL: define dso_local void @xvrotri_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> [[_1]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvrotri_d(v4i64 _1) { return __builtin_lasx_xvrotri_d(_1, 1); } // CHECK-LABEL: define dso_local void @xvextl_q_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> [[_1]]) -// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// 
CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvextl_q_d(v4i64 _1) { return __builtin_lasx_xvextl_q_d(_1); } // CHECK-LABEL: define dso_local void @xvsrlni_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrlni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrlni_b_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlni_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrlni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlni_h_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlni_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr 
[[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrlni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlni_w_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlni_d_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsrlni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlni_d_q(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlrni_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvsrlrni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrlrni_b_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlrni_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 
32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvsrlrni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrlrni_h_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlrni_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvsrlrni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrlrni_w_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvsrlrni_d_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvsrlrni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrlrni_d_q(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_b_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32i8 xvssrlni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrlni_b_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_h_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16i16 xvssrlni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrlni_h_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_w_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvssrlni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrlni_w_d(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_d_q( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvssrlni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrlni_d_q(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_bu_h( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1) -// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v32u8 xvssrlni_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrlni_bu_h(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_hu_w( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1) -// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v16u16 xvssrlni_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrlni_hu_w(_1, _2, 1); } // CHECK-LABEL: define dso_local void @xvssrlni_wu_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8u32 xvssrlni_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrlni_wu_d(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrlni_du_q(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4u64 xvssrlni_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrlni_du_q(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrlrni_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvssrlrni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrlrni_b_h(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrlrni_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvssrlrni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrlrni_h_w(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrlrni_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvssrlrni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrlrni_w_d(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrlrni_d_q(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvssrlrni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrlrni_d_q(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrlrni_bu_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32u8 xvssrlrni_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrlrni_bu_h(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrlrni_hu_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16u16 xvssrlrni_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrlrni_hu_w(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrlrni_wu_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8u32 xvssrlrni_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrlrni_wu_d(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrlrni_du_q(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4u64 xvssrlrni_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrlrni_du_q(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvsrani_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvsrani_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrani_b_h(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvsrani_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvsrani_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrani_h_w(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvsrani_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvsrani_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrani_w_d(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvsrani_d_q(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvsrani_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrani_d_q(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvsrarni_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvsrarni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvsrarni_b_h(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvsrarni_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvsrarni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvsrarni_h_w(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvsrarni_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvsrarni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvsrarni_w_d(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvsrarni_d_q(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvsrarni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvsrarni_d_q(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrani_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvssrani_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrani_b_h(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrani_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvssrani_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrani_h_w(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrani_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvssrani_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrani_w_d(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrani_d_q(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvssrani_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrani_d_q(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrani_bu_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32u8 xvssrani_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrani_bu_h(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrani_hu_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16u16 xvssrani_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrani_hu_w(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrani_wu_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8u32 xvssrani_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrani_wu_d(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrani_du_q(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4u64 xvssrani_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrani_du_q(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrarni_b_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32i8 xvssrarni_b_h(v32i8 _1, v32i8 _2) { return __builtin_lasx_xvssrarni_b_h(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrarni_h_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16i16 xvssrarni_h_w(v16i16 _1, v16i16 _2) { return __builtin_lasx_xvssrarni_h_w(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrarni_w_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvssrarni_w_d(v8i32 _1, v8i32 _2) { return __builtin_lasx_xvssrarni_w_d(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrarni_d_q(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvssrarni_d_q(v4i64 _1, v4i64 _2) { return __builtin_lasx_xvssrarni_d_q(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrarni_bu_h(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <32 x i8>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> [[_1]], <32 x i8> [[_2]], i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v32u8 xvssrarni_bu_h(v32u8 _1, v32i8 _2) { return __builtin_lasx_xvssrarni_bu_h(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrarni_hu_w(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <16 x i16>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> [[_1]], <16 x i16> [[_2]], i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v16u16 xvssrarni_hu_w(v16u16 _1, v16i16 _2) { return __builtin_lasx_xvssrarni_hu_w(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrarni_wu_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> [[_1]], <8 x i32> [[_2]], i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8u32 xvssrarni_wu_d(v8u32 _1, v8i32 _2) { return __builtin_lasx_xvssrarni_wu_d(_1, _2, 1); }
 // CHECK-LABEL: define dso_local void @xvssrarni_du_q(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> [[_1]], <4 x i64> [[_2]], i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4u64 xvssrarni_du_q(v4u64 _1, v4i64 _2) { return __builtin_lasx_xvssrarni_du_q(_1, _2, 1); }
 // CHECK-LABEL: define dso_local signext i32 @xbnz_b(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8> [[_1]])
 // CHECK-NEXT: ret i32 [[TMP1]]
 //
@@ -6583,7 +6583,7 @@ int xbnz_b(v32u8 _1) { return __builtin_lasx_xbnz_b(_1); }
 // CHECK-LABEL: define dso_local signext i32 @xbnz_d(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> [[_1]])
 // CHECK-NEXT: ret i32 [[TMP1]]
 //
@@ -6591,7 +6591,7 @@ int xbnz_d(v4u64 _1) { return __builtin_lasx_xbnz_d(_1); }
 // CHECK-LABEL: define dso_local signext i32 @xbnz_h(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> [[_1]])
 // CHECK-NEXT: ret i32 [[TMP1]]
 //
@@ -6599,7 +6599,7 @@ int xbnz_h(v16u16 _1) { return __builtin_lasx_xbnz_h(_1); }
 // CHECK-LABEL: define dso_local signext i32 @xbnz_v(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> [[_1]])
 // CHECK-NEXT: ret i32 [[TMP1]]
 //
@@ -6607,7 +6607,7 @@ int xbnz_v(v32u8 _1) { return __builtin_lasx_xbnz_v(_1); }
 // CHECK-LABEL: define dso_local signext i32 @xbnz_w(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> [[_1]])
 // CHECK-NEXT: ret i32 [[TMP1]]
 //
@@ -6615,7 +6615,7 @@ int xbnz_w(v8u32 _1) { return __builtin_lasx_xbnz_w(_1); }
 // CHECK-LABEL: define dso_local signext i32 @xbz_b(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> [[_1]])
 // CHECK-NEXT: ret i32 [[TMP1]]
 //
@@ -6623,7 +6623,7 @@ int xbz_b(v32u8 _1) { return __builtin_lasx_xbz_b(_1); }
 // CHECK-LABEL: define dso_local signext i32 @xbz_d(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> [[_1]])
 // CHECK-NEXT: ret i32 [[TMP1]]
 //
@@ -6631,7 +6631,7 @@ int xbz_d(v4u64 _1) { return __builtin_lasx_xbz_d(_1); }
 // CHECK-LABEL: define dso_local signext i32 @xbz_h(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <16 x i16>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> [[_1]])
 // CHECK-NEXT: ret i32 [[TMP1]]
 //
@@ -6639,7 +6639,7 @@ int xbz_h(v16u16 _1) { return __builtin_lasx_xbz_h(_1); }
 // CHECK-LABEL: define dso_local signext i32 @xbz_v(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <32 x i8>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> [[_1]])
 // CHECK-NEXT: ret i32 [[TMP1]]
 //
@@ -6647,7 +6647,7 @@ int xbz_v(v32u8 _1) { return __builtin_lasx_xbz_v(_1); }
 // CHECK-LABEL: define dso_local signext i32 @xbz_w(
 // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> [[_1]])
 // CHECK-NEXT: ret i32 [[TMP1]]
 //
@@ -6655,458 +6655,458 @@ int xbz_w(v8u32 _1) { return __builtin_lasx_xbz_w(_1); }
 // CHECK-LABEL: define dso_local void @xvfcmp_caf_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_caf_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_caf_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_caf_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfcmp_caf_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_caf_s(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_ceq_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_ceq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_ceq_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_ceq_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfcmp_ceq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_ceq_s(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cle_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_cle_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cle_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cle_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfcmp_cle_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cle_s(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_clt_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_clt_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_clt_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_clt_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfcmp_clt_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_clt_s(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cne_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_cne_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cne_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cne_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfcmp_cne_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cne_s(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cor_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_cor_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cor_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cor_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfcmp_cor_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cor_s(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cueq_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_cueq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cueq_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cueq_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfcmp_cueq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cueq_s(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cule_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_cule_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cule_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cule_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfcmp_cule_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cule_s(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cult_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_cult_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cult_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cult_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfcmp_cult_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cult_s(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cun_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_cun_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cun_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cune_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_cune_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_cune_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cune_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfcmp_cune_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cune_s(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_cun_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfcmp_cun_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_cun_s(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_saf_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_saf_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_saf_d(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_saf_s(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v8i32 xvfcmp_saf_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_saf_s(_1, _2); }
 // CHECK-LABEL: define dso_local void @xvfcmp_seq_d(
 // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
 // CHECK-NEXT: ret void
 //
 v4i64 xvfcmp_seq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_seq_d(_1, _2); }
 // CHECK-LABEL: define dso_local void
@xvfcmp_seq_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvfcmp_seq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_seq_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvfcmp_sle_d( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> [[_1]], <4 x double> [[_2]]) -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v4i64 xvfcmp_sle_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sle_d(_1, _2); } // CHECK-LABEL: define dso_local void @xvfcmp_sle_s( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> [[_1]], <8 x float> [[_2]]) -// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // v8i32 xvfcmp_sle_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sle_s(_1, _2); } // CHECK-LABEL: define dso_local void @xvfcmp_slt_d( 
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_slt_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_slt_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_slt_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_slt_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_slt_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sne_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sne_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sne_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sne_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sne_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sne_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sor_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sor_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sor_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sor_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sor_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sor_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sueq_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sueq_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sueq_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sueq_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sueq_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sueq_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sule_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sule_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sule_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sule_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sule_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sule_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sult_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sult_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sult_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sult_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sult_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sult_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sun_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sun_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sun_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sune_d(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> [[_1]], <4 x double> [[_2]])
-// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvfcmp_sune_d(v4f64 _1, v4f64 _2) { return __builtin_lasx_xvfcmp_sune_d(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sune_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sune_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sune_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvfcmp_sun_s(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
-// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
+// CHECK-NEXT: [[_2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> [[_1]], <8 x float> [[_2]])
-// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvfcmp_sun_s(v8f32 _1, v8f32 _2) { return __builtin_lasx_xvfcmp_sun_s(_1, _2); }
// CHECK-LABEL: define dso_local void @xvpickve_d_f(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> [[_1]], i32 1)
-// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4f64 xvpickve_d_f(v4f64 _1) { return __builtin_lasx_xvpickve_d_f(_1, 1); }
// CHECK-LABEL: define dso_local void @xvpickve_w_f(
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> [[_1]], i32 1)
-// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8f32 xvpickve_w_f(v8f32 _1) { return __builtin_lasx_xvpickve_w_f(_1, 1); }
@@ -7114,7 +7114,7 @@ v8f32 xvpickve_w_f(v8f32 _1) { return __builtin_lasx_xvpickve_w_f(_1, 1); }
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32 1)
-// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <32 x i8> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v32i8 xvrepli_b() { return __builtin_lasx_xvrepli_b(1); }
@@ -7122,7 +7122,7 @@ v32i8 xvrepli_b() { return __builtin_lasx_xvrepli_b(1); }
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32 1)
-// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i64> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v4i64 xvrepli_d() { return __builtin_lasx_xvrepli_d(1); }
@@ -7130,7 +7130,7 @@ v4i64 xvrepli_d() { return __builtin_lasx_xvrepli_d(1); }
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i16>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32 1)
-// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <16 x i16> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v16i16 xvrepli_h() { return __builtin_lasx_xvrepli_h(1); }
@@ -7138,12 +7138,12 @@ v16i16 xvrepli_h() { return __builtin_lasx_xvrepli_h(1); }
// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32 1)
-// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <8 x i32> [[TMP0]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
v8i32 xvrepli_w() { return __builtin_lasx_xvrepli_w(1); }
//.
-// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
-// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0}
//.
diff --git a/clang/test/CodeGen/LoongArch/lasx/inline-asm-gcc-regs.c b/clang/test/CodeGen/LoongArch/lasx/inline-asm-gcc-regs.c
index ed1a9660a06c9..0dc74ff63d089 100644
--- a/clang/test/CodeGen/LoongArch/lasx/inline-asm-gcc-regs.c
+++ b/clang/test/CodeGen/LoongArch/lasx/inline-asm-gcc-regs.c
@@ -4,7 +4,7 @@
typedef signed char v32i8 __attribute__((vector_size(32), aligned(32)));
// CHECK-LABEL: @test_xr0(
-// CHECK: tail call void asm sideeffect "", "{$xr0}"(<32 x i8> undef) #[[ATTR1:[0-9]+]], !srcloc !2
+// CHECK: tail call void asm sideeffect "", "{$xr0}"(<32 x i8> undef) #[[ATTR1:[0-9]+]], !srcloc [[META6:![0-9]+]]
//
void test_xr0() {
register v32i8 a asm ("$xr0");
@@ -12,7 +12,7 @@ void test_xr0() {
}
// CHECK-LABEL: @test_xr7(
-// CHECK: tail call void asm sideeffect "", "{$xr7}"(<32 x i8> undef) #[[ATTR1]], !srcloc !3
+// CHECK: tail call void asm sideeffect "", "{$xr7}"(<32 x i8> undef) #[[ATTR1]], !srcloc [[META7:![0-9]+]]
//
void test_xr7() {
register v32i8 a asm ("$xr7");
@@ -20,7 +20,7 @@ void test_xr7() {
}
// CHECK-LABEL: @test_xr15(
-// CHECK: tail call void asm sideeffect "", "{$xr15}"(<32 x i8> undef) #[[ATTR1]], !srcloc !4
+// CHECK: tail call void asm sideeffect "", "{$xr15}"(<32 x i8> undef) #[[ATTR1]], !srcloc [[META8:![0-9]+]]
//
void test_xr15() {
register v32i8 a asm ("$xr15");
@@ -28,7 +28,7 @@ void test_xr15() {
}
// CHECK-LABEL: @test_xr31(
-// CHECK: tail call void asm sideeffect "", "{$xr31}"(<32 x i8> undef) #[[ATTR1]], !srcloc !5
+// CHECK: tail call void asm sideeffect "", "{$xr31}"(<32 x i8> undef) #[[ATTR1]], !srcloc [[META9:![0-9]+]]
//
void test_xr31() {
register v32i8 a asm ("$xr31");
diff --git a/clang/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.c b/clang/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.c
index a5cc8798fd66b..cb5e6891885dc 100644
--- a/clang/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.c
+++ b/clang/test/CodeGen/LoongArch/lasx/inline-asm-operand-modifier.c
@@ -6,7 +6,7 @@
typedef long long v4i64 __attribute__ ((vector_size(32), aligned(32)));
// CHECK-LABEL: define dso_local void @test_u
// CHECK-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> asm sideeffect "xvldi ${0:u}, 1", "=f"() #[[ATTR1:[0-9]+]], !srcloc !2
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i64> asm sideeffect "xvldi ${0:u}, 1", "=f"() #[[ATTR1:[0-9]+]], !srcloc [[META6:![0-9]+]]
// CHECK-NEXT: ret void
//
void test_u() {
diff --git a/clang/test/CodeGen/LoongArch/lsx/inline-asm-gcc-regs.c b/clang/test/CodeGen/LoongArch/lsx/inline-asm-gcc-regs.c
index b05b1c8c15fae..588a3a1249247 100644
--- a/clang/test/CodeGen/LoongArch/lsx/inline-asm-gcc-regs.c
+++ b/clang/test/CodeGen/LoongArch/lsx/inline-asm-gcc-regs.c
@@ -4,7 +4,7 @@
typedef signed char v16i8 __attribute__((vector_size(16), aligned(16)));
// CHECK-LABEL: @test_vr0(
-// CHECK: tail call void asm sideeffect "", "{$vr0}"(<16 x i8> undef) #[[ATTR1:[0-9]+]], !srcloc !2
+// CHECK: tail call void asm sideeffect "", "{$vr0}"(<16 x i8> undef) #[[ATTR1:[0-9]+]], !srcloc [[META6:![0-9]+]]
//
void test_vr0() {
register v16i8 a asm ("$vr0");
@@ -12,7 +12,7 @@ void test_vr0() {
}
// CHECK-LABEL: @test_vr7(
-// CHECK: tail call void asm sideeffect "", "{$vr7}"(<16 x i8> undef) #[[ATTR1]], !srcloc !3
+// CHECK: tail call void asm sideeffect "", "{$vr7}"(<16 x i8> undef) #[[ATTR1]], !srcloc [[META7:![0-9]+]]
//
void test_vr7() {
register v16i8 a asm ("$vr7");
@@ -20,7 +20,7 @@ void test_vr7() {
}
// CHECK-LABEL: @test_vr15(
-// CHECK: tail call void asm sideeffect "", "{$vr15}"(<16 x i8> undef) #[[ATTR1]], !srcloc !4
+// CHECK: tail call void asm sideeffect "", "{$vr15}"(<16 x i8> undef) #[[ATTR1]], !srcloc [[META8:![0-9]+]]
//
void test_vr15() {
register v16i8 a asm ("$vr15");
@@ -28,7 +28,7 @@ void test_vr15() {
}
// CHECK-LABEL: @test_vr31(
-// CHECK: tail call void asm sideeffect "", "{$vr31}"(<16 x i8> undef) #[[ATTR1]], !srcloc !5
+// CHECK: tail call void asm sideeffect "", "{$vr31}"(<16 x i8> undef) #[[ATTR1]], !srcloc [[META9:![0-9]+]]
//
void test_vr31() {
register v16i8 a asm ("$vr31");
diff --git a/clang/test/CodeGen/LoongArch/lsx/inline-asm-operand-modifier.c b/clang/test/CodeGen/LoongArch/lsx/inline-asm-operand-modifier.c
index 5e0fae984134e..f0fb6e31a1a02 100644
--- a/clang/test/CodeGen/LoongArch/lsx/inline-asm-operand-modifier.c
+++ b/clang/test/CodeGen/LoongArch/lsx/inline-asm-operand-modifier.c
@@ -6,7 +6,7 @@
typedef long long v2i64 __attribute__ ((vector_size(16), aligned(16)));
// CHECK-LABEL: define dso_local void @test_w
// CHECK-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> asm sideeffect "vldi ${0:w}, 1", "=f"() #[[ATTR1:[0-9]+]], !srcloc !2
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> asm sideeffect "vldi ${0:w}, 1", "=f"() #[[ATTR1:[0-9]+]], !srcloc [[META6:![0-9]+]]
// CHECK-NEXT: ret void
//
void test_w() {
diff --git a/clang/test/CodeGen/PowerPC/builtins-dmf-vsx-vector-float.c b/clang/test/CodeGen/PowerPC/builtins-dmf-vsx-vector-float.c
index 8fc9a68a5a613..f59a964641119 100644
--- a/clang/test/CodeGen/PowerPC/builtins-dmf-vsx-vector-float.c
+++ b/clang/test/CodeGen/PowerPC/builtins-dmf-vsx-vector-float.c
@@ -1,17 +1,26 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
// Update then manual applied to commonize the checks for AIX and LoP.
// RUN: %clang_cc1 -O3 -triple powerpc64le-unknown-unknown -target-cpu future \
// RUN: -emit-llvm %s -o - | FileCheck %s
// RUN: %clang_cc1 -O3 -triple powerpc64-ibm-aix -target-cpu future \
-// RUN: -emit-llvm %s -o - | FileCheck %s
+// RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=AIX
-// CHECK-LABEL: void @test_dmxvbf16gerx2(
+// CHECK-LABEL: define dso_local void @test_dmxvbf16gerx2(
+// CHECK-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]])
-// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC]])
+// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8:![0-9]+]]
// CHECK-NEXT: ret void
//
+// AIX-LABEL: define void @test_dmxvbf16gerx2(
+// AIX-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// AIX-NEXT: [[ENTRY:.*:]]
+// AIX-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6:![0-9]+]]
+// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC]])
+// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8:![0-9]+]]
+// AIX-NEXT: ret void
+//
void test_dmxvbf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
__vector_pair vp = *((__vector_pair *)vpp);
@@ -19,14 +28,24 @@ void test_dmxvbf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsigne
*((__dmr1024 *)resp) = vdmr;
}
-// CHECK-LABEL: void @test_dmxvbf16gerx2nn(
+// CHECK-LABEL: define dso_local void @test_dmxvbf16gerx2nn(
+// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]])
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]])
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
// CHECK-NEXT: ret void
//
+// AIX-LABEL: define void @test_dmxvbf16gerx2nn(
+// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AIX-NEXT: [[ENTRY:.*:]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]])
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: ret void
+//
void test_dmxvbf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
__vector_pair vp = *((__vector_pair *)vpp);
@@ -34,14 +53,24 @@ void test_dmxvbf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsig
*((__dmr1024 *)resp) = vdmr;
}
-// CHECK-LABEL: void @test_dmxvbf16gerx2np(
+// CHECK-LABEL: define dso_local void @test_dmxvbf16gerx2np(
+// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]])
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]])
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
// CHECK-NEXT: ret void
//
+// AIX-LABEL: define void @test_dmxvbf16gerx2np(
+// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AIX-NEXT: [[ENTRY:.*:]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]])
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: ret void
+//
void test_dmxvbf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
__vector_pair vp = *((__vector_pair *)vpp);
@@ -49,14 +78,24 @@ void test_dmxvbf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector unsig
*((__dmr1024 *)resp) = vdmr;
}
-// CHECK-LABEL: void @test_dmxvbf16gerx2pn(
+// CHECK-LABEL: define dso_local void @test_dmxvbf16gerx2pn(
+// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]])
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]])
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
// CHECK-NEXT: ret void
//
+// AIX-LABEL: define void @test_dmxvbf16gerx2pn(
+// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AIX-NEXT: [[ENTRY:.*:]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]])
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: ret void
+//
void test_dmxvbf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
__vector_pair vp = *((__vector_pair *)vpp);
@@ -64,14 +103,24 @@ void test_dmxvbf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsig
*((__dmr1024 *)resp) = vdmr;
}
-// CHECK-LABEL: void @test_dmxvbf16gerx2pp(
+// CHECK-LABEL: define dso_local void @test_dmxvbf16gerx2pp(
+// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]])
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]])
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
// CHECK-NEXT: ret void
//
+// AIX-LABEL: define void @test_dmxvbf16gerx2pp(
+// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AIX-NEXT: [[ENTRY:.*:]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]])
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: ret void
+//
void test_dmxvbf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
__vector_pair vp = *((__vector_pair *)vpp);
@@ -79,13 +128,22 @@ void test_dmxvbf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsig
*((__dmr1024 *)resp) = vdmr;
}
-// CHECK-LABEL: void @test_pmdmxvbf16gerx2(
+// CHECK-LABEL: define dso_local void @test_pmdmxvbf16gerx2(
+// CHECK-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
-// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
// CHECK-NEXT: ret void
//
+// AIX-LABEL: define void @test_pmdmxvbf16gerx2(
+// AIX-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AIX-NEXT: [[ENTRY:.*:]]
+// AIX-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: ret void
+//
void test_pmdmxvbf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
__vector_pair vp = *((__vector_pair *)vpp);
@@ -93,14 +151,24 @@ void test_pmdmxvbf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsig
*((__dmr1024 *)resp) = vdmr;
}
-// CHECK-LABEL: void @test_pmdmxvbf16gerx2nn(
+// CHECK-LABEL: define dso_local void @test_pmdmxvbf16gerx2nn(
+// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
// CHECK-NEXT: ret void
//
+// AIX-LABEL: define void @test_pmdmxvbf16gerx2nn(
+// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AIX-NEXT: [[ENTRY:.*:]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: ret void
+//
void test_pmdmxvbf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
__vector_pair vp = *((__vector_pair *)vpp);
@@ -108,14 +176,24 @@ void test_pmdmxvbf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector uns
*((__dmr1024 *)resp) = vdmr;
}
-// CHECK-LABEL: void @test_pmdmxvbf16gerx2np(
+// CHECK-LABEL: define dso_local void @test_pmdmxvbf16gerx2np(
+// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
// CHECK-NEXT: ret void
//
+// AIX-LABEL: define void @test_pmdmxvbf16gerx2np(
+// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AIX-NEXT: [[ENTRY:.*:]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: ret void
+//
void test_pmdmxvbf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
__vector_pair vp = *((__vector_pair *)vpp);
@@ -123,14 +201,24 @@ void test_pmdmxvbf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector uns
*((__dmr1024 *)resp) = vdmr;
}
-// CHECK-LABEL: void @test_pmdmxvbf16gerx2pn(
+// CHECK-LABEL: define dso_local void @test_pmdmxvbf16gerx2pn(
+// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
// CHECK-NEXT: ret void
//
+// AIX-LABEL: define void @test_pmdmxvbf16gerx2pn(
+// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AIX-NEXT: [[ENTRY:.*:]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: ret void
+//
void test_pmdmxvbf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
__vector_pair vp = *((__vector_pair *)vpp);
@@ -138,14 +226,24 @@ void test_pmdmxvbf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector uns
*((__dmr1024 *)resp) = vdmr;
}
-// CHECK-LABEL: void @test_pmdmxvbf16gerx2pp(
+// CHECK-LABEL: define dso_local void @test_pmdmxvbf16gerx2pp(
+// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
// CHECK-NEXT: ret void
//
+// AIX-LABEL: define void @test_pmdmxvbf16gerx2pp(
+// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AIX-NEXT: [[ENTRY:.*:]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: ret void
+//
void test_pmdmxvbf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
__vector_pair vp = *((__vector_pair *)vpp);
@@ -153,13 +251,22 @@ void test_pmdmxvbf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector uns
*((__dmr1024 *)resp) = vdmr;
}
-// CHECK-LABEL: void @test_dmxvf16gerx2(
+// CHECK-LABEL: define dso_local void @test_dmxvf16gerx2(
+// CHECK-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]])
-// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1>
@llvm.ppc.mma.dmxvf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_dmxvf16gerx2( +// AIX-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC]]) +// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: ret void +// void test_dmxvf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -167,14 +274,24 @@ void test_dmxvf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsigned *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: void @test_dmxvf16gerx2nn( +// CHECK-LABEL: define dso_local void @test_dmxvf16gerx2nn( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_dmxvf16gerx2nn( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: ret void +// void test_dmxvf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -182,14 +299,24 @@ void test_dmxvf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsign *((__dmr1024 
*)resp) = vdmr; } -// CHECK-LABEL: void @test_dmxvf16gerx2np( +// CHECK-LABEL: define dso_local void @test_dmxvf16gerx2np( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_dmxvf16gerx2np( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: ret void +// void test_dmxvf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -197,14 +324,24 @@ void test_dmxvf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector unsign *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: void @test_dmxvf16gerx2pn( +// CHECK-LABEL: define dso_local void @test_dmxvf16gerx2pn( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> 
[[VC]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_dmxvf16gerx2pn( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: ret void +// void test_dmxvf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -212,14 +349,24 @@ void test_dmxvf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsign *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: void @test_dmxvf16gerx2pp( +// CHECK-LABEL: define dso_local void @test_dmxvf16gerx2pp( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_dmxvf16gerx2pp( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: ret void +// void test_dmxvf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -227,13 +374,22 @@ void 
test_dmxvf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsign *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: void @test_pmdmxvf16gerx2( +// CHECK-LABEL: define dso_local void @test_pmdmxvf16gerx2( +// CHECK-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_pmdmxvf16gerx2( +// AIX-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: ret void +// void test_pmdmxvf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -241,14 +397,24 @@ void test_pmdmxvf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsign *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: void @test_pmdmxvf16gerx2nn( +// CHECK-LABEL: define dso_local void @test_pmdmxvf16gerx2nn( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void 
@test_pmdmxvf16gerx2nn( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: ret void +// void test_pmdmxvf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -256,14 +422,24 @@ void test_pmdmxvf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsi *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: void @test_pmdmxvf16gerx2np( +// CHECK-LABEL: define dso_local void @test_pmdmxvf16gerx2np( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_pmdmxvf16gerx2np( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: ret void +// void test_pmdmxvf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -271,14 +447,24 @@ void test_pmdmxvf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector 
unsi *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: void @test_pmdmxvf16gerx2pn( +// CHECK-LABEL: define dso_local void @test_pmdmxvf16gerx2pn( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_pmdmxvf16gerx2pn( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: ret void +// void test_pmdmxvf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -286,14 +472,24 @@ void test_pmdmxvf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsi *((__dmr1024 *)resp) = vdmr; } -// CHECK-LABEL: void @test_pmdmxvf16gerx2pp( +// CHECK-LABEL: define dso_local void @test_pmdmxvf16gerx2pp( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] // CHECK-NEXT: ret void // +// AIX-LABEL: define void @test_pmdmxvf16gerx2pp( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]] +// AIX-NEXT: ret void +// void test_pmdmxvf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); __vector_pair vp = *((__vector_pair *)vpp); @@ -301,9 +497,18 @@ void test_pmdmxvf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsi *((__dmr1024 *)resp) = vdmr; } -// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"__vector_pair", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +//. +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} -// CHECK: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} -// CHECK: [[META7]] = !{!"__dmr1024", [[META4]], i64 0} +// CHECK: [[__VECTOR_PAIR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"__vector_pair", [[META4]], i64 0} +// CHECK: [[__DMR1024_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +// CHECK: [[META9]] = !{!"__dmr1024", [[META4]], i64 0} +//. +// AIX: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// AIX: [[META5]] = !{!"Simple C/C++ TBAA"} +// AIX: [[__VECTOR_PAIR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// AIX: [[META7]] = !{!"__vector_pair", [[META4]], i64 0} +// AIX: [[__DMR1024_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +// AIX: [[META9]] = !{!"__dmr1024", [[META4]], i64 0} +//. 
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c b/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c
index 59b71cd355813..e602be7c59ae5 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c
@@ -10,14 +10,14 @@
 // CHECK-LE-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC1:%.*]], <16 x i8> noundef [[VC2:%.*]], <16 x i8> noundef [[VC3:%.*]], <16 x i8> noundef [[VC4:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-LE-NEXT: [[ENTRY:.*:]]
 // CHECK-LE-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[VC4]], <16 x i8> [[VC3]], <16 x i8> [[VC2]], <16 x i8> [[VC1]])
-// CHECK-LE-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2:![0-9]+]]
+// CHECK-LE-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6:![0-9]+]]
 // CHECK-LE-NEXT: ret void
 //
 // CHECK-BE-LABEL: define dso_local void @test1(
 // CHECK-BE-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC1:%.*]], <16 x i8> noundef [[VC2:%.*]], <16 x i8> noundef [[VC3:%.*]], <16 x i8> noundef [[VC4:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-BE-NEXT: [[ENTRY:.*:]]
 // CHECK-BE-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[VC1]], <16 x i8> [[VC2]], <16 x i8> [[VC3]], <16 x i8> [[VC4]])
-// CHECK-BE-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2:![0-9]+]]
+// CHECK-BE-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6:![0-9]+]]
 // CHECK-BE-NEXT: ret void
 //
 // CHECK-LE-NOOPT-LABEL: define dso_local void @test1(
@@ -70,14 +70,14 @@ void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc1, vec
 // CHECK-LE-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC1:%.*]], <16 x i8> noundef [[VC2:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-LE-NEXT: [[ENTRY:.*:]]
 // CHECK-LE-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC2]], <16 x i8> [[VC1]])
-// CHECK-LE-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6:![0-9]+]]
+// CHECK-LE-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8:![0-9]+]]
 // CHECK-LE-NEXT: ret void
 //
 // CHECK-BE-LABEL: define dso_local void @test2(
 // CHECK-BE-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC1:%.*]], <16 x i8> noundef [[VC2:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-BE-NEXT: [[ENTRY:.*:]]
 // CHECK-BE-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC1]], <16 x i8> [[VC2]])
-// CHECK-BE-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6:![0-9]+]]
+// CHECK-BE-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8:![0-9]+]]
 // CHECK-BE-NEXT: ret void
 //
 // CHECK-LE-NOOPT-LABEL: define dso_local void @test2(
@@ -120,17 +120,17 @@ void test2(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc1,
   *((__vector_pair *)resp) = res;
 }
 //.
-// CHECK-LE: [[__VECTOR_QUAD_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// CHECK-LE: [[META3]] = !{!"__vector_quad", [[META4:![0-9]+]], i64 0}
-// CHECK-LE: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK-LE: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
 // CHECK-LE: [[META5]] = !{!"Simple C/C++ TBAA"}
-// CHECK-LE: [[__VECTOR_PAIR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
-// CHECK-LE: [[META7]] = !{!"__vector_pair", [[META4]], i64 0}
+// CHECK-LE: [[__VECTOR_QUAD_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
+// CHECK-LE: [[META7]] = !{!"__vector_quad", [[META4]], i64 0}
+// CHECK-LE: [[__VECTOR_PAIR_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0}
+// CHECK-LE: [[META9]] = !{!"__vector_pair", [[META4]], i64 0}
 //.
-// CHECK-BE: [[__VECTOR_QUAD_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// CHECK-BE: [[META3]] = !{!"__vector_quad", [[META4:![0-9]+]], i64 0}
-// CHECK-BE: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK-BE: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
 // CHECK-BE: [[META5]] = !{!"Simple C/C++ TBAA"}
-// CHECK-BE: [[__VECTOR_PAIR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
-// CHECK-BE: [[META7]] = !{!"__vector_pair", [[META4]], i64 0}
+// CHECK-BE: [[__VECTOR_QUAD_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
+// CHECK-BE: [[META7]] = !{!"__vector_quad", [[META4]], i64 0}
+// CHECK-BE: [[__VECTOR_PAIR_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0}
+// CHECK-BE: [[META9]] = !{!"__vector_pair", [[META4]], i64 0}
 //.
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c
index d8306a74ad2e9..585d8bac57181 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c
@@ -8,17 +8,17 @@
 // CHECK-LABEL: define dso_local void @test_dmxvi8gerx4(
 // CHECK-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6:![0-9]+]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1> [[TMP0]], <16 x i8> [[VC]])
-// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6:![0-9]+]]
+// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8:![0-9]+]]
 // CHECK-NEXT: ret void
 //
 // AIX-LABEL: define void @test_dmxvi8gerx4(
 // AIX-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // AIX-NEXT: [[ENTRY:.*:]]
-// AIX-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2:![0-9]+]]
+// AIX-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6:![0-9]+]]
 // AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1> [[TMP0]], <16 x i8> [[VC]])
-// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6:![0-9]+]]
+// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8:![0-9]+]]
 // AIX-NEXT: ret void
 //
 void test_dmxvi8gerx4(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
@@ -31,17 +31,17 @@ void test_dmxvi8gerx4(unsigned char *vdmrp, unsigned char *vpp, vector unsigned
 // CHECK-LABEL: define dso_local void @test_pmdmxvi8gerx4(
 // CHECK-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4(<256 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
-// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // CHECK-NEXT: ret void
 //
 // AIX-LABEL: define void @test_pmdmxvi8gerx4(
 // AIX-SAME: ptr noundef readnone captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIX-NEXT: [[ENTRY:.*:]]
-// AIX-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]]
+// AIX-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
 // AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4(<256 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
-// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // AIX-NEXT: ret void
 //
 void test_pmdmxvi8gerx4(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
@@ -54,19 +54,19 @@ void test_pmdmxvi8gerx4(unsigned char *vdmrp, unsigned char *vpp, vector unsigne
 // CHECK-LABEL: define dso_local void @test_dmxvi8gerx4pp(
 // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]])
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // CHECK-NEXT: ret void
 //
 // AIX-LABEL: define void @test_dmxvi8gerx4pp(
 // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIX-NEXT: [[ENTRY:.*:]]
-// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]]
-// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
 // AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]])
-// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // AIX-NEXT: ret void
 //
 void test_dmxvi8gerx4pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
@@ -79,19 +79,19 @@ void test_dmxvi8gerx4pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigne
 // CHECK-LABEL: define dso_local void @test_pmdmxvi8gerx4pp(
 // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // CHECK-NEXT: ret void
 //
 // AIX-LABEL: define void @test_pmdmxvi8gerx4pp(
 // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIX-NEXT: [[ENTRY:.*:]]
-// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]]
-// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
 // AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
-// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // AIX-NEXT: ret void
 //
 void test_pmdmxvi8gerx4pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
@@ -104,19 +104,19 @@ void test_pmdmxvi8gerx4pp(unsigned char *vdmrp, unsigned char *vpp, vector unsig
 // CHECK-LABEL: define dso_local void @test_dmxvi8gerx4spp(
 // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4spp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]])
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // CHECK-NEXT: ret void
 //
 // AIX-LABEL: define void @test_dmxvi8gerx4spp(
 // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIX-NEXT: [[ENTRY:.*:]]
-// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]]
-// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
 // AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4spp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]])
-// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // AIX-NEXT: ret void
 //
 void test_dmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
@@ -129,19 +129,19 @@ void test_dmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsign
 // CHECK-LABEL: define dso_local void @test_pmdmxvi8gerx4spp(
 // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4spp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // CHECK-NEXT: ret void
 //
 // AIX-LABEL: define void @test_pmdmxvi8gerx4spp(
 // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIX-NEXT: [[ENTRY:.*:]]
-// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]]
-// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA2]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]]
 // AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4spp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0, i32 0)
-// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // AIX-NEXT: ret void
 //
 void test_pmdmxvi8gerx4spp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
@@ -185,7 +185,7 @@ void test_dmf_basic(char *p, char *res1, char *res2) {
 // CHECK-LABEL: define dso_local void @test_dmf_basic2(
 // CHECK-SAME: ptr noundef readonly captures(none) [[P1:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RES1:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RES2:%.*]], ptr noundef readonly captures(none) [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V]], align 16, !tbaa [[CHAR_TBAA10:![0-9]+]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]])
 // CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES2]], align 128
 // CHECK-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[P1]], align 128
@@ -195,7 +195,7 @@ void test_dmf_basic(char *p, char *res1, char *res2) {
 // AIX-LABEL: define void @test_dmf_basic2(
 // AIX-SAME: ptr noundef readonly captures(none) [[P1:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RES1:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RES2:%.*]], ptr noundef readonly captures(none) [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIX-NEXT: [[ENTRY:.*:]]
-// AIX-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]]
+// AIX-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V]], align 16, !tbaa [[CHAR_TBAA10:![0-9]+]]
 // AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.build.dmr(<16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]], <16 x i8> [[TMP0]])
 // AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RES2]], align 128
 // AIX-NEXT: [[TMP2:%.*]] = load <1024 x i1>, ptr [[P1]], align 128
@@ -212,19 +212,19 @@ void test_dmf_basic2(char *p1, char *res1, char *res2,
 // CHECK-LABEL: define dso_local void @test_dmsha2hash(
 // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP1:%.*]], ptr noundef readonly captures(none) [[VDMRP2:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA6]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1)
-// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // CHECK-NEXT: ret void
 //
 // AIX-LABEL: define void @test_dmsha2hash(
 // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP1:%.*]], ptr noundef readonly captures(none) [[VDMRP2:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIX-NEXT: [[ENTRY:.*:]]
-// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA6]]
-// AIX-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA8]]
+// AIX-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1)
-// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // AIX-NEXT: ret void
 //
 void test_dmsha2hash(unsigned char *vdmrp1, unsigned char *vdmrp2, unsigned char *resp) {
@@ -237,17 +237,17 @@ void test_dmsha2hash(unsigned char *vdmrp1, unsigned char *vdmrp2, unsigned char
 // CHECK-LABEL: define dso_local void @test_dmsha3hash(
 // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRPP:%.*]], ptr noundef writeonly captures(none) initializes((0, 256)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA9:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA11:![0-9]+]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> [[TMP0]], i32 4)
-// CHECK-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA9]]
+// CHECK-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA11]]
 // CHECK-NEXT: ret void
 //
 // AIX-LABEL: define void @test_dmsha3hash(
 // AIX-SAME: ptr noundef readonly captures(none) [[VDMRPP:%.*]], ptr noundef writeonly captures(none) initializes((0, 256)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIX-NEXT: [[ENTRY:.*:]]
-// AIX-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA9:![0-9]+]]
+// AIX-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA11:![0-9]+]]
 // AIX-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> [[TMP0]], i32 4)
-// AIX-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA9]]
+// AIX-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA11]]
 // AIX-NEXT: ret void
 //
 void test_dmsha3hash(unsigned char *vdmrpp, unsigned char *resp) {
@@ -259,17 +259,17 @@ void test_dmsha3hash(unsigned char *vdmrpp, unsigned char *resp) {
 // CHECK-LABEL: define dso_local void @test_dmxxshapad(
 // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 1, i32 5)
-// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // CHECK-NEXT: ret void
 //
 // AIX-LABEL: define void @test_dmxxshapad(
 // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIX-NEXT: [[ENTRY:.*:]]
-// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 1, i32 5)
-// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]]
+// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA8]]
 // AIX-NEXT: ret void
 //
 void test_dmxxshapad(unsigned char *vdmrp, vector unsigned char vc, unsigned char *resp) {
@@ -278,23 +278,23 @@ void test_dmxxshapad(unsigned char *vdmrp, vector unsigned char vc, unsigned cha
   *((__dmr1024 *)resp) = vdmr;
 }
 //.
-// CHECK: [[__VECTOR_PAIR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// CHECK: [[META3]] = !{!"__vector_pair", [[META4:![0-9]+]], i64 0}
-// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
 // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"}
-// CHECK: [[__DMR1024_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
-// CHECK: [[META7]] = !{!"__dmr1024", [[META4]], i64 0}
-// CHECK: [[CHAR_TBAA8]] = !{[[META4]], [[META4]], i64 0}
-// CHECK: [[__DMR2048_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0}
-// CHECK: [[META10]] = !{!"__dmr2048", [[META4]], i64 0}
+// CHECK: [[__VECTOR_PAIR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
+// CHECK: [[META7]] = !{!"__vector_pair", [[META4]], i64 0}
+// CHECK: [[__DMR1024_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0}
+// CHECK: [[META9]] = !{!"__dmr1024", [[META4]], i64 0}
+// CHECK: [[CHAR_TBAA10]] = !{[[META4]], [[META4]], i64 0}
+// CHECK: [[__DMR2048_TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0}
+// CHECK: [[META12]] = !{!"__dmr2048", [[META4]], i64 0}
 //.
-// AIX: [[__VECTOR_PAIR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// AIX: [[META3]] = !{!"__vector_pair", [[META4:![0-9]+]], i64 0}
-// AIX: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// AIX: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
 // AIX: [[META5]] = !{!"Simple C/C++ TBAA"}
-// AIX: [[__DMR1024_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
-// AIX: [[META7]] = !{!"__dmr1024", [[META4]], i64 0}
-// AIX: [[CHAR_TBAA8]] = !{[[META4]], [[META4]], i64 0}
-// AIX: [[__DMR2048_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0}
-// AIX: [[META10]] = !{!"__dmr2048", [[META4]], i64 0}
+// AIX: [[__VECTOR_PAIR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
+// AIX: [[META7]] = !{!"__vector_pair", [[META4]], i64 0}
+// AIX: [[__DMR1024_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0}
+// AIX: [[META9]] = !{!"__dmr1024", [[META4]], i64 0}
+// AIX: [[CHAR_TBAA10]] = !{[[META4]], [[META4]], i64 0}
+// AIX: [[__DMR2048_TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0}
+// AIX: [[META12]] = !{!"__dmr2048", [[META4]], i64 0}
 //.
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma.c b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma.c
index 5c7b222cb618e..74f9714446e00 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma.c
@@ -8,7 +8,7 @@
 // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[VC]], <16 x i8> [[VC]], <16 x i8> [[VC]], <16 x i8> [[VC]])
-// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2:![0-9]+]]
+// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6:![0-9]+]]
 // CHECK-NEXT: ret void
 //
 void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
@@ -45,7 +45,7 @@ void test2(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi
 // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC]], <16 x i8> [[VC]])
-// CHECK-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6:![0-9]+]]
+// CHECK-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8:![0-9]+]]
 // CHECK-NEXT: ret void
 //
 void test3(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
@@ -75,9 +75,9 @@ void test4(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi
 // CHECK-LABEL: define dso_local void @test5(
 // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1> [[TMP0]])
-// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]]
+// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]]
 // CHECK-NEXT: ret void
 //
 void test5(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
@@ -90,9 +90,9 @@ void test5(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi
 // CHECK-LABEL: define dso_local void @test6(
 // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]]
 // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xxmfacc(<512 x i1> [[TMP0]])
-// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]]
+// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]]
 // CHECK-NEXT: ret void
 //
 void test6(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
@@ -106,7 +106,7 @@ void test6(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi
 // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
-// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]]
+// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]]
 // CHECK-NEXT: ret void
 //
 void test7(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
@@ -120,7 +120,7 @@ void test7(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi
 // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[VC]], <16 x i8> [[VC]])
-// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]]
+// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]]
 // CHECK-NEXT: ret void
 //
 void test8(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
@@ -134,7 +134,7 @@ void test8(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi
 // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> [[VC]], <16
x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test9(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -148,7 +148,7 @@ void test9(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test10(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -162,7 +162,7 @@ void test10(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test11(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -176,7 +176,7 @@ void test11(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test12(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -190,7 +190,7 @@ void test12(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test13(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, 
unsigned char *resp) { @@ -203,9 +203,9 @@ void test13(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test14( // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP0]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test14(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -219,7 +219,7 @@ void test14(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test15(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -233,7 +233,7 @@ void test15(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test16(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -247,7 +247,7 @@ void test16(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test17(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, 
unsigned char *resp) { @@ -261,7 +261,7 @@ void test17(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test18(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -275,7 +275,7 @@ void test18(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test19(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -289,7 +289,7 @@ void test19(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test20(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -302,9 +302,9 @@ void test20(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test21( // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test21(unsigned char *vqp, unsigned char *vpp, vector 
unsigned char vc, unsigned char *resp) { @@ -317,9 +317,9 @@ void test21(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test22( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test22(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -332,9 +332,9 @@ void test22(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test23( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test23(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -347,9 +347,9 @@ void test23(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test24( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test24(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -362,9 +362,9 @@ void test24(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test25( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> 
noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test25(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -377,9 +377,9 @@ void test25(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test26( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test26(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -392,9 +392,9 @@ void test26(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test27( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test27(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -407,9 +407,9 @@ void test27(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test28( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = 
load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test28(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -422,9 +422,9 @@ void test28(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test29( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test29(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -437,9 +437,9 @@ void test29(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test30( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test30(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -452,9 +452,9 @@ void test30(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test31( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: 
store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test31(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -467,9 +467,9 @@ void test31(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test32( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test32(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -482,9 +482,9 @@ void test32(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test33( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test33(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -497,9 +497,9 @@ void test33(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test34( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test34(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ 
-512,9 +512,9 @@ void test34(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test35( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test35(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -527,9 +527,9 @@ void test35(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test36( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test36(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -542,9 +542,9 @@ void test36(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test37( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test37(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -557,9 +557,9 @@ void test37(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test38( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x 
i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test38(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -572,9 +572,9 @@ void test38(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test39( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test39(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -587,9 +587,9 @@ void test39(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test40( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test40(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -602,9 +602,9 @@ void test40(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test41( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// 
CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test41(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -617,9 +617,9 @@ void test41(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test42( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test42(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -632,9 +632,9 @@ void test42(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test43( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test43(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -647,9 +647,9 @@ void test43(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test44( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, 
!tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test44(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -662,9 +662,9 @@ void test44(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test45( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test45(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -677,9 +677,9 @@ void test45(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test46( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test46(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -692,9 +692,9 @@ void test46(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test47( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test47(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -707,10 
+707,10 @@ void test47(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test48( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test48(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -723,10 +723,10 @@ void test48(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test49( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpn(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test49(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -739,10 +739,10 @@ void test49(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test50( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) -// CHECK-NEXT: store 
<512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test50(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -755,10 +755,10 @@ void test50(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test51( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernn(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test51(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -771,10 +771,10 @@ void test51(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test52( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gerpp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test52(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -787,10 +787,10 @@ void test52(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test53( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load 
<512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gerpn(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test53(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -803,10 +803,10 @@ void test53(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test54( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test54(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -819,10 +819,10 @@ void test54(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test55( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readonly captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test55(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -836,7 +836,7 @@ void test55(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) 
[[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2(<16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test56(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -850,7 +850,7 @@ void test56(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2(<16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP0]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test57(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -863,9 +863,9 @@ void test57(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test58( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test58(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -878,9 +878,9 @@ void test58(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test59( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2pn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test59(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -893,9 +893,9 @@ void test59(unsigned char 
*vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test60( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2np(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test60(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -908,9 +908,9 @@ void test60(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test61( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvbf16ger2nn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test61(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -923,9 +923,9 @@ void test61(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test62( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pp(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test62(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -938,9 +938,9 @@ void test62(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test63( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) 
initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2pn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test63(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -953,9 +953,9 @@ void test63(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test64( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2np(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test64(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -968,9 +968,9 @@ void test64(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-LABEL: define dso_local void @test65( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvbf16ger2nn(<512 x i1> [[TMP0]], <16 x i8> [[VC]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP1]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test65(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1079,11 +1079,11 @@ void test72(const __vector_pair *vpp, __vector_pair *vp2) { // CHECK-LABEL: define dso_local void @test73( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa 
[[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[VPP]], i64 8 // CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP0]], <256 x i1> [[TMP2]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test73(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1096,10 +1096,10 @@ void test73(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char v // CHECK-LABEL: define dso_local void @test74( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[VPP]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test74(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1112,11 +1112,11 @@ void test74(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char v // CHECK-LABEL: define dso_local void @test75( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], i64 noundef [[OFFS:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[VPP]], i64 [[OFFS]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP2]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test75(unsigned char *vqp, signed long offs, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1130,7 +1130,7 @@ void test75(unsigned char *vqp, signed long offs, const __vector_pair *vpp, vect // CHECK-SAME: ptr noundef readnone captures(none) [[VQP:%.*]], ptr noundef readnone captures(none) [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 32)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[VC]], 
<16 x i8> [[VC]]) -// CHECK-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA6]] +// CHECK-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP]], align 32, !tbaa [[__VECTOR_PAIR_TBAA8]] // CHECK-NEXT: ret void // void test76(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1256,11 +1256,11 @@ void test84(const __vector_pair *vpp, __vector_pair *vp2) { // CHECK-LABEL: define dso_local void @test85( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[VPP]], i64 8 // CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP0]], <256 x i1> [[TMP2]], <16 x i8> [[VC]], i32 0, i32 0) -// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test85(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1273,10 +1273,10 @@ void test85(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char v // CHECK-LABEL: define dso_local void @test86( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[VPP]]) // CHECK-NEXT: [[TMP2:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test86(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1289,11 +1289,11 @@ void test86(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char v // CHECK-LABEL: define dso_local void @test87( // CHECK-SAME: ptr noundef readonly captures(none) [[VQP:%.*]], i64 noundef [[OFFS:%.*]], ptr noundef [[VPP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 64)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <512 x i1>, ptr [[VQP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[VPP]], i64 [[OFFS]] // CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = tail call <512 x i1> 
@llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP0]], <256 x i1> [[TMP2]], <16 x i8> [[VC]]) -// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA2]] +// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[RESP]], align 64, !tbaa [[__VECTOR_QUAD_TBAA6]] // CHECK-NEXT: ret void // void test87(unsigned char *vqp, signed long offs, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { @@ -1303,10 +1303,10 @@ void test87(unsigned char *vqp, signed long offs, const __vector_pair *vpp, vect *((__vector_quad *)resp) = vq; } //. -// CHECK: [[__VECTOR_QUAD_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"__vector_quad", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} -// CHECK: [[__VECTOR_PAIR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} -// CHECK: [[META7]] = !{!"__vector_pair", [[META4]], i64 0} +// CHECK: [[__VECTOR_QUAD_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"__vector_quad", [[META4]], i64 0} +// CHECK: [[__VECTOR_PAIR_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +// CHECK: [[META9]] = !{!"__vector_pair", [[META4]], i64 0} //. diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c index 1f0b3d4a560e7..c943649c76abf 100644 --- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast-less-8.c @@ -57,7 +57,7 @@ DEFINE_STRUCT(bool64) // CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 1 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[CHAR_TBAA6:![0-9]+]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[CHAR_TBAA10:![0-9]+]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i8.v1i8( poison, <1 x i8> [[TMP0]], i64 0) // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-128-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv8i1( [[TMP1]], i64 0) @@ -74,7 +74,7 @@ vbool32_t read_bool32(struct struct_bool32 *s) { // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[TMP0]] to // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i8> @llvm.vector.extract.v1i8.nxv1i8( [[TMP1]], i64 0) // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 1 -// CHECK-128-NEXT: store <1 x i8> [[CAST_FIXED]], ptr [[Y]], align 1, !tbaa [[CHAR_TBAA6]] +// CHECK-128-NEXT: store <1 x i8> [[CAST_FIXED]], ptr [[Y]], align 1, !tbaa [[CHAR_TBAA10]] // CHECK-128-NEXT: ret void // void write_bool32(struct struct_bool32 *s, vbool32_t x) { @@ -85,7 +85,7 @@ void write_bool32(struct struct_bool32 *s, vbool32_t x) { // CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 1 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[CHAR_TBAA6]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[Y]], align 1, !tbaa [[CHAR_TBAA10]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i8.v1i8( poison, <1 x i8> [[TMP0]], i64 
0) // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-128-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv1i1.nxv8i1( [[TMP1]], i64 0) @@ -102,14 +102,14 @@ vbool64_t read_bool64(struct struct_bool64 *s) { // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[TMP0]] to // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i8> @llvm.vector.extract.v1i8.nxv1i8( [[TMP1]], i64 0) // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 1 -// CHECK-128-NEXT: store <1 x i8> [[CAST_FIXED]], ptr [[Y]], align 1, !tbaa [[CHAR_TBAA6]] +// CHECK-128-NEXT: store <1 x i8> [[CAST_FIXED]], ptr [[Y]], align 1, !tbaa [[CHAR_TBAA10]] // CHECK-128-NEXT: ret void // void write_bool64(struct struct_bool64 *s, vbool64_t x) { s->y[0] = x; } //. -// CHECK-128: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} -// CHECK-128: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} -// CHECK-128: [[META8]] = !{!"Simple C/C++ TBAA"} +// CHECK-128: [[META8:![0-9]+]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} +// CHECK-128: [[META9]] = !{!"Simple C/C++ TBAA"} +// CHECK-128: [[CHAR_TBAA10]] = !{[[META8]], [[META8]], i64 0} //. diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast.c index b92e6dff31748..71d6f469ded39 100644 --- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-bitcast.c @@ -71,7 +71,7 @@ DEFINE_STRUCT(bool64) // CHECK-64-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-64-NEXT: [[ENTRY:.*:]] // CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 -// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6:![0-9]+]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10:![0-9]+]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v1i64( poison, <1 x i64> [[TMP0]], i64 0) // CHECK-64-NEXT: ret [[CAST_SCALABLE]] // @@ -79,7 +79,7 @@ DEFINE_STRUCT(bool64) // CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6:![0-9]+]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10:![0-9]+]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v2i64( poison, <2 x i64> [[TMP0]], i64 0) // CHECK-128-NEXT: ret [[CAST_SCALABLE]] // @@ -87,7 +87,7 @@ DEFINE_STRUCT(bool64) // CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6:![0-9]+]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10:![0-9]+]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v4i64( poison, <4 x i64> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // @@ -100,7 +100,7 @@ vint64m1_t read_int64m1(struct struct_int64m1 *s) { // CHECK-64-NEXT: [[ENTRY:.*:]] // CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i64> @llvm.vector.extract.v1i64.nxv1i64( [[X]], i64 0) // 
CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 -// CHECK-64-NEXT: store <1 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-64-NEXT: store <1 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-64-NEXT: ret void // // CHECK-128-LABEL: define dso_local void @write_int64m1( @@ -108,7 +108,7 @@ vint64m1_t read_int64m1(struct struct_int64m1 *s) { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i64> @llvm.vector.extract.v2i64.nxv1i64( [[X]], i64 0) // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 -// CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: define dso_local void @write_int64m1( @@ -116,7 +116,7 @@ vint64m1_t read_int64m1(struct struct_int64m1 *s) { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv1i64( [[X]], i64 0) // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 -// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-256-NEXT: ret void // void write_int64m1(struct struct_int64m1 *s, vint64m1_t x) { @@ -131,7 +131,7 @@ void write_int64m1(struct struct_int64m1 *s, vint64m1_t x) { // CHECK-64-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-64-NEXT: [[ENTRY:.*:]] // CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 -// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1f64.v1f64( poison, <1 x double> [[TMP0]], i64 0) // CHECK-64-NEXT: ret [[CAST_SCALABLE]] // @@ -139,7 +139,7 @@ void write_int64m1(struct struct_int64m1 *s, vint64m1_t x) { // CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1f64.v2f64( poison, <2 x double> [[TMP0]], i64 0) // CHECK-128-NEXT: ret [[CAST_SCALABLE]] // @@ -147,7 +147,7 @@ void write_int64m1(struct struct_int64m1 *s, vint64m1_t x) { // CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1f64.v4f64( poison, <4 x double> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // @@ -160,7 +160,7 @@ vfloat64m1_t read_float64m1(struct struct_float64m1 *s) { // CHECK-64-NEXT: [[ENTRY:.*:]] 
// CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x double> @llvm.vector.extract.v1f64.nxv1f64( [[X]], i64 0) // CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 -// CHECK-64-NEXT: store <1 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-64-NEXT: store <1 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-64-NEXT: ret void // // CHECK-128-LABEL: define dso_local void @write_float64m1( @@ -168,7 +168,7 @@ vfloat64m1_t read_float64m1(struct struct_float64m1 *s) { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x double> @llvm.vector.extract.v2f64.nxv1f64( [[X]], i64 0) // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 -// CHECK-128-NEXT: store <2 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-128-NEXT: store <2 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: define dso_local void @write_float64m1( @@ -176,7 +176,7 @@ vfloat64m1_t read_float64m1(struct struct_float64m1 *s) { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x double> @llvm.vector.extract.v4f64.nxv1f64( [[X]], i64 0) // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 -// CHECK-256-NEXT: store <4 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-256-NEXT: store <4 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-256-NEXT: ret void // void write_float64m1(struct struct_float64m1 *s, vfloat64m1_t x) { @@ -191,7 +191,7 @@ void write_float64m1(struct struct_float64m1 *s, vfloat64m1_t x) { // CHECK-64-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-64-NEXT: [[ENTRY:.*:]] // CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 -// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8i8.v8i8( poison, <8 x i8> [[TMP0]], i64 0) // CHECK-64-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-64-NEXT: ret [[TMP1]] @@ -200,7 +200,7 @@ void write_float64m1(struct struct_float64m1 *s, vfloat64m1_t x) { // CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8i8.v16i8( poison, <16 x i8> [[TMP0]], i64 0) // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-128-NEXT: ret [[TMP1]] @@ -209,7 +209,7 @@ void write_float64m1(struct struct_float64m1 *s, vfloat64m1_t x) { // CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-256-NEXT: 
[[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8i8.v32i8( poison, <32 x i8> [[TMP0]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-256-NEXT: ret [[TMP1]] @@ -224,7 +224,7 @@ vbool1_t read_bool1(struct struct_bool1 *s) { // CHECK-64-NEXT: [[TMP0:%.*]] = bitcast [[X]] to // CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv8i8( [[TMP0]], i64 0) // CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 -// CHECK-64-NEXT: store <8 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-64-NEXT: store <8 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-64-NEXT: ret void // // CHECK-128-LABEL: define dso_local void @write_bool1( @@ -233,7 +233,7 @@ vbool1_t read_bool1(struct struct_bool1 *s) { // CHECK-128-NEXT: [[TMP0:%.*]] = bitcast [[X]] to // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x i8> @llvm.vector.extract.v16i8.nxv8i8( [[TMP0]], i64 0) // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 -// CHECK-128-NEXT: store <16 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-128-NEXT: store <16 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: define dso_local void @write_bool1( @@ -242,22 +242,22 @@ vbool1_t read_bool1(struct struct_bool1 *s) { // CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[X]] to // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8( [[TMP0]], i64 0) // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 -// CHECK-256-NEXT: store <32 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-256-NEXT: store <32 x i8> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[CHAR_TBAA10]] // CHECK-256-NEXT: ret void // void write_bool1(struct struct_bool1 *s, vbool1_t x) { s->y[0] = x; } //. -// CHECK-64: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} -// CHECK-64: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} -// CHECK-64: [[META8]] = !{!"Simple C/C++ TBAA"} +// CHECK-64: [[META8:![0-9]+]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} +// CHECK-64: [[META9]] = !{!"Simple C/C++ TBAA"} +// CHECK-64: [[CHAR_TBAA10]] = !{[[META8]], [[META8]], i64 0} //. -// CHECK-128: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} -// CHECK-128: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} -// CHECK-128: [[META8]] = !{!"Simple C/C++ TBAA"} +// CHECK-128: [[META8:![0-9]+]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} +// CHECK-128: [[META9]] = !{!"Simple C/C++ TBAA"} +// CHECK-128: [[CHAR_TBAA10]] = !{[[META8]], [[META8]], i64 0} //. -// CHECK-256: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} -// CHECK-256: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} -// CHECK-256: [[META8]] = !{!"Simple C/C++ TBAA"} +// CHECK-256: [[META8:![0-9]+]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} +// CHECK-256: [[META9]] = !{!"Simple C/C++ TBAA"} +// CHECK-256: [[CHAR_TBAA10]] = !{[[META8]], [[META8]], i64 0} //. 
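The pattern the attr-rvv-vector-bits tests above keep exercising is Clang's `riscv_rvv_vector_bits` attribute, which pins a scalable RVV type to a compile-time width so it can be a struct member. A minimal standalone sketch of the struct case, assuming a riscv64 target with the V extension and an illustrative `-mrvv-vector-bits=256`; the `dummy` field is a stand-in for the test's own layout macro, not taken from the patch:

#include <riscv_vector.h>

typedef vint64m1_t fixed_int64m1_t
    __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));

struct struct_int64m1 {
  long dummy;           /* illustrative padding so y sits at a nonzero offset */
  fixed_int64m1_t y[1]; /* fixed width, so it has a size and may live in a struct */
};

/* Per the checks above, a read lowers to a fixed-width load followed by
   llvm.vector.insert; a write is the mirror image, llvm.vector.extract
   followed by a fixed-width store. */
vint64m1_t read_int64m1(struct struct_int64m1 *s) { return s->y[0]; }
void write_int64m1(struct struct_int64m1 *s, vint64m1_t x) { s->y[0] = x; }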
diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c index 4517b52aefdfd..fd500f014da82 100644 --- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-cast.c @@ -124,7 +124,7 @@ vbool32_t to_vbool32_t(fixed_bool32_t type) { // CHECK-LABEL: define dso_local @to_vint32m1_t__from_gnu_int32m1_t( // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6:![0-9]+]] +// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA10:![0-9]+]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i32.v8i32( poison, <8 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -136,7 +136,7 @@ vint32m1_t to_vint32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) { // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], [[TYPE:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TYPE]], i64 0) -// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <8 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA10]] // CHECK-NEXT: ret void // gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) { @@ -146,7 +146,7 @@ gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) { // CHECK-LABEL: define dso_local @to_fixed_int32m1_t__from_gnu_int32m1_t( // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TYPE:%.*]] = load <8 x i32>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA10]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i32.v8i32( poison, <8 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret [[CAST_SCALABLE]] // @@ -158,14 +158,14 @@ fixed_int32m1_t to_fixed_int32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) { // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], noundef [[TYPE_COERCE:%.*]]) local_unnamed_addr #[[ATTR3]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TYPE:%.*]] = tail call <8 x i32> @llvm.vector.extract.v8i32.nxv2i32( [[TYPE_COERCE]], i64 0) -// CHECK-NEXT: store <8 x i32> [[TYPE]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <8 x i32> [[TYPE]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA10]] // CHECK-NEXT: ret void // gnu_int32m1_t from_fixed_int32m1_t__to_gnu_int32m1_t(fixed_int32m1_t type) { return type; } //. -// CHECK: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} -// CHECK: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} -// CHECK: [[META8]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META8:![0-9]+]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} +// CHECK: [[META9]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[CHAR_TBAA10]] = !{[[META8]], [[META8]], i64 0} //. 
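The cast test just above pairs the same attribute with GNU `vector_size` vectors. A sketch of the two conversions its checks pin down, again under the illustrative assumption `-mrvv-vector-bits=256`, so `__riscv_v_fixed_vlen` is 256 and the GNU type is 32 bytes (<8 x i32> in the IR):

#include <riscv_vector.h>

typedef vint32m1_t fixed_int32m1_t
    __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));
typedef int gnu_int32m1_t
    __attribute__((vector_size(__riscv_v_fixed_vlen / 8)));

/* GNU fixed vectors arrive indirectly: the callee loads <8 x i32> and widens
   it into a scalable register via llvm.vector.insert. */
vint32m1_t to_vint32m1_t__from_gnu_int32m1_t(gnu_int32m1_t type) {
  return type;
}

/* The reverse direction extracts <8 x i32> and stores it through the sret slot. */
gnu_int32m1_t from_vint32m1_t__to_gnu_int32m1_t(vint32m1_t type) {
  return type;
}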
diff --git a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c index f3b91b23a73e4..f6c734f0dba66 100644 --- a/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c +++ b/clang/test/CodeGen/RISCV/attr-rvv-vector-bits-globals.c @@ -44,14 +44,14 @@ fixed_bool32_t global_bool32; // CHECK-64-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-64-NEXT: [[ENTRY:.*:]] // CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i64> @llvm.vector.extract.v1i64.nxv1i64( [[V]], i64 0) -// CHECK-64-NEXT: store <1 x i64> [[CAST_FIXED]], ptr @global_i64, align 8, !tbaa [[CHAR_TBAA6:![0-9]+]] +// CHECK-64-NEXT: store <1 x i64> [[CAST_FIXED]], ptr @global_i64, align 8, !tbaa [[CHAR_TBAA10:![0-9]+]] // CHECK-64-NEXT: ret void // // CHECK-256-LABEL: define dso_local void @write_global_i64( // CHECK-256-SAME: [[V:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv1i64( [[V]], i64 0) -// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr @global_i64, align 8, !tbaa [[CHAR_TBAA6:![0-9]+]] +// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr @global_i64, align 8, !tbaa [[CHAR_TBAA10:![0-9]+]] // CHECK-256-NEXT: ret void // void write_global_i64(vint64m1_t v) { global_i64 = v; } @@ -61,7 +61,7 @@ void write_global_i64(vint64m1_t v) { global_i64 = v; } // CHECK-64-NEXT: [[ENTRY:.*:]] // CHECK-64-NEXT: [[TMP0:%.*]] = bitcast [[V]] to // CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv8i8( [[TMP0]], i64 0) -// CHECK-64-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool1, align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-64-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool1, align 8, !tbaa [[CHAR_TBAA10]] // CHECK-64-NEXT: ret void // // CHECK-256-LABEL: define dso_local void @write_global_bool1( @@ -69,7 +69,7 @@ void write_global_i64(vint64m1_t v) { global_i64 = v; } // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[V]] to // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <32 x i8> @llvm.vector.extract.v32i8.nxv8i8( [[TMP0]], i64 0) -// CHECK-256-NEXT: store <32 x i8> [[CAST_FIXED]], ptr @global_bool1, align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-256-NEXT: store <32 x i8> [[CAST_FIXED]], ptr @global_bool1, align 8, !tbaa [[CHAR_TBAA10]] // CHECK-256-NEXT: ret void // void write_global_bool1(vbool1_t v) { global_bool1 = v; } @@ -79,7 +79,7 @@ void write_global_bool1(vbool1_t v) { global_bool1 = v; } // CHECK-64-NEXT: [[ENTRY:.*:]] // CHECK-64-NEXT: [[TMP0:%.*]] = bitcast [[V]] to // CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8( [[TMP0]], i64 0) -// CHECK-64-NEXT: store <2 x i8> [[CAST_FIXED]], ptr @global_bool4, align 2, !tbaa [[CHAR_TBAA6]] +// CHECK-64-NEXT: store <2 x i8> [[CAST_FIXED]], ptr @global_bool4, align 2, !tbaa [[CHAR_TBAA10]] // CHECK-64-NEXT: ret void // // CHECK-256-LABEL: define dso_local void @write_global_bool4( @@ -87,7 +87,7 @@ void write_global_bool1(vbool1_t v) { global_bool1 = v; } // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[V]] to // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8( [[TMP0]], i64 0) -// CHECK-256-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool4, align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-256-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool4, align 8, !tbaa [[CHAR_TBAA10]] // CHECK-256-NEXT: ret void // void 
write_global_bool4(vbool4_t v) { global_bool4 = v; } @@ -99,7 +99,7 @@ void write_global_bool4(vbool4_t v) { global_bool4 = v; } // CHECK-256-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i1.nxv2i1( zeroinitializer, [[V]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[TMP0]] to // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i8> @llvm.vector.extract.v1i8.nxv1i8( [[TMP1]], i64 0) -// CHECK-256-NEXT: store <1 x i8> [[CAST_FIXED]], ptr @global_bool32, align 1, !tbaa [[CHAR_TBAA6]] +// CHECK-256-NEXT: store <1 x i8> [[CAST_FIXED]], ptr @global_bool32, align 1, !tbaa [[CHAR_TBAA10]] // CHECK-256-NEXT: ret void // void write_global_bool32(vbool32_t v) { global_bool32 = v; } @@ -112,14 +112,14 @@ void write_global_bool32(vbool32_t v) { global_bool32 = v; } // CHECK-64-LABEL: define dso_local @read_global_i64( // CHECK-64-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-64-NEXT: [[ENTRY:.*:]] -// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @global_i64, align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @global_i64, align 8, !tbaa [[CHAR_TBAA10]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v1i64( poison, <1 x i64> [[TMP0]], i64 0) // CHECK-64-NEXT: ret [[CAST_SCALABLE]] // // CHECK-256-LABEL: define dso_local @read_global_i64( // CHECK-256-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-256-NEXT: [[ENTRY:.*:]] -// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr @global_i64, align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr @global_i64, align 8, !tbaa [[CHAR_TBAA10]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v4i64( poison, <4 x i64> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // @@ -128,7 +128,7 @@ vint64m1_t read_global_i64() { return global_i64; } // CHECK-64-LABEL: define dso_local @read_global_bool1( // CHECK-64-SAME: ) local_unnamed_addr #[[ATTR2]] { // CHECK-64-NEXT: [[ENTRY:.*:]] -// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool1, align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool1, align 8, !tbaa [[CHAR_TBAA10]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8i8.v8i8( poison, <8 x i8> [[TMP0]], i64 0) // CHECK-64-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-64-NEXT: ret [[TMP1]] @@ -136,7 +136,7 @@ vint64m1_t read_global_i64() { return global_i64; } // CHECK-256-LABEL: define dso_local @read_global_bool1( // CHECK-256-SAME: ) local_unnamed_addr #[[ATTR2]] { // CHECK-256-NEXT: [[ENTRY:.*:]] -// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr @global_bool1, align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr @global_bool1, align 8, !tbaa [[CHAR_TBAA10]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8i8.v32i8( poison, <32 x i8> [[TMP0]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-256-NEXT: ret [[TMP1]] @@ -146,7 +146,7 @@ vbool1_t read_global_bool1() { return global_bool1; } // CHECK-64-LABEL: define dso_local @read_global_bool4( // CHECK-64-SAME: ) local_unnamed_addr #[[ATTR2]] { // CHECK-64-NEXT: [[ENTRY:.*:]] -// CHECK-64-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr @global_bool4, align 2, !tbaa [[CHAR_TBAA6]] +// CHECK-64-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr @global_bool4, align 2, !tbaa [[CHAR_TBAA10]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v2i8( 
poison, <2 x i8> [[TMP0]], i64 0) // CHECK-64-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-64-NEXT: ret [[TMP1]] @@ -154,7 +154,7 @@ vbool1_t read_global_bool1() { return global_bool1; } // CHECK-256-LABEL: define dso_local @read_global_bool4( // CHECK-256-SAME: ) local_unnamed_addr #[[ATTR2]] { // CHECK-256-NEXT: [[ENTRY:.*:]] -// CHECK-256-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool4, align 8, !tbaa [[CHAR_TBAA6]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool4, align 8, !tbaa [[CHAR_TBAA10]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v8i8( poison, <8 x i8> [[TMP0]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-256-NEXT: ret [[TMP1]] @@ -165,7 +165,7 @@ vbool4_t read_global_bool4() { return global_bool4; } // CHECK-256-LABEL: define dso_local @read_global_bool32( // CHECK-256-SAME: ) local_unnamed_addr #[[ATTR2]] { // CHECK-256-NEXT: [[ENTRY:.*:]] -// CHECK-256-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr @global_bool32, align 1, !tbaa [[CHAR_TBAA6]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr @global_bool32, align 1, !tbaa [[CHAR_TBAA10]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i8.v1i8( poison, <1 x i8> [[TMP0]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to // CHECK-256-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i1.nxv8i1( [[TMP1]], i64 0) @@ -174,11 +174,11 @@ vbool4_t read_global_bool4() { return global_bool4; } vbool32_t read_global_bool32() { return global_bool32; } #endif //. -// CHECK-64: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} -// CHECK-64: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} -// CHECK-64: [[META8]] = !{!"Simple C/C++ TBAA"} +// CHECK-64: [[META8:![0-9]+]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} +// CHECK-64: [[META9]] = !{!"Simple C/C++ TBAA"} +// CHECK-64: [[CHAR_TBAA10]] = !{[[META8]], [[META8]], i64 0} //. -// CHECK-256: [[CHAR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} -// CHECK-256: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} -// CHECK-256: [[META8]] = !{!"Simple C/C++ TBAA"} +// CHECK-256: [[META8:![0-9]+]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} +// CHECK-256: [[META9]] = !{!"Simple C/C++ TBAA"} +// CHECK-256: [[CHAR_TBAA10]] = !{[[META8]], [[META8]], i64 0} //. 
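The globals variant above closes out the set: a scalable type has no static size and cannot be a global, but the fixed-width typedef can, and the checks show each access lowering to an ordinary load or store bracketed by llvm.vector.insert/llvm.vector.extract. A sketch under the same illustrative `-mrvv-vector-bits=256` assumption:

#include <riscv_vector.h>

typedef vint64m1_t fixed_int64m1_t
    __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));

fixed_int64m1_t global_i64;  /* a sized object with ordinary C linkage */

void write_global_i64(vint64m1_t v) { global_i64 = v; } /* extract + store */
vint64m1_t read_global_i64(void) { return global_i64; } /* load + insert */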
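The new bitint.c test that follows pins down the RISC-V `_BitInt` ABI at -O2: on riscv64 every width tested passes directly as an `iN` with `zeroext`/`signext` matching the signedness, while on riscv32 the 65- and 77-bit cases are widened to i128, passed indirectly, and returned through an `sret` pointer. A one-function orientation sketch (the name `add77` is illustrative, not the test's naming scheme):

/* Unsigned _BitInt arithmetic wraps modulo 2**77; the signed variants in the
   test get `nsw` on the add instead. */
unsigned _BitInt(77) add77(unsigned _BitInt(77) a, unsigned _BitInt(77) b) {
  return a + b;
}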
diff --git a/clang/test/CodeGen/RISCV/bitint.c b/clang/test/CodeGen/RISCV/bitint.c new file mode 100644 index 0000000000000..1ad43affac9e6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/bitint.c @@ -0,0 +1,342 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature +// RUN: %clang_cc1 -triple riscv64 -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=RISCV64 +// RUN: %clang_cc1 -triple riscv32 -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=RISCV32 +// RUN: %clang_cc1 -triple riscv32 -fforce-enable-int128 -O2 -emit-llvm -o - %s | FileCheck %s --check-prefix=RISCV32_INT128 + +// RISCV64-LABEL: define {{[^@]+}}@test_bitint_17_add_unsigned +// RISCV64-SAME: (i17 noundef zeroext [[A:%.*]], i17 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// RISCV64-NEXT: entry: +// RISCV64-NEXT: [[ADD:%.*]] = add i17 [[B]], [[A]] +// RISCV64-NEXT: ret i17 [[ADD]] +// +// RISCV32-LABEL: define {{[^@]+}}@test_bitint_17_add_unsigned +// RISCV32-SAME: (i17 noundef zeroext [[A:%.*]], i17 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// RISCV32-NEXT: entry: +// RISCV32-NEXT: [[ADD:%.*]] = add i17 [[B]], [[A]] +// RISCV32-NEXT: ret i17 [[ADD]] +// +// RISCV32_INT128-LABEL: define {{[^@]+}}@test_bitint_17_add_unsigned +// RISCV32_INT128-SAME: (i17 noundef zeroext [[A:%.*]], i17 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// RISCV32_INT128-NEXT: entry: +// RISCV32_INT128-NEXT: [[ADD:%.*]] = add i17 [[B]], [[A]] +// RISCV32_INT128-NEXT: ret i17 [[ADD]] +// +unsigned _BitInt(17) test_bitint_17_add_unsigned(unsigned _BitInt(17) a, unsigned _BitInt(17) b) { + return a + b; +} + +// RISCV64-LABEL: define {{[^@]+}}@test_bitint_17_add_signed +// RISCV64-SAME: (i17 noundef signext [[A:%.*]], i17 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV64-NEXT: entry: +// RISCV64-NEXT: [[ADD:%.*]] = add nsw i17 [[B]], [[A]] +// RISCV64-NEXT: ret i17 [[ADD]] +// +// RISCV32-LABEL: define {{[^@]+}}@test_bitint_17_add_signed +// RISCV32-SAME: (i17 noundef signext [[A:%.*]], i17 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV32-NEXT: entry: +// RISCV32-NEXT: [[ADD:%.*]] = add nsw i17 [[B]], [[A]] +// RISCV32-NEXT: ret i17 [[ADD]] +// +// RISCV32_INT128-LABEL: define {{[^@]+}}@test_bitint_17_add_signed +// RISCV32_INT128-SAME: (i17 noundef signext [[A:%.*]], i17 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV32_INT128-NEXT: entry: +// RISCV32_INT128-NEXT: [[ADD:%.*]] = add nsw i17 [[B]], [[A]] +// RISCV32_INT128-NEXT: ret i17 [[ADD]] +// +signed _BitInt(17) test_bitint_17_add_signed(signed _BitInt(17) a, signed _BitInt(17) b) { + return a + b; +} + +// RISCV64-LABEL: define {{[^@]+}}@test_bitint_17_add_default +// RISCV64-SAME: (i17 noundef signext [[A:%.*]], i17 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV64-NEXT: entry: +// RISCV64-NEXT: [[ADD:%.*]] = add nsw i17 [[B]], [[A]] +// RISCV64-NEXT: ret i17 [[ADD]] +// +// RISCV32-LABEL: define {{[^@]+}}@test_bitint_17_add_default +// RISCV32-SAME: (i17 noundef signext [[A:%.*]], i17 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV32-NEXT: entry: +// RISCV32-NEXT: [[ADD:%.*]] = add nsw i17 [[B]], [[A]] +// RISCV32-NEXT: ret i17 [[ADD]] +// +// RISCV32_INT128-LABEL: define {{[^@]+}}@test_bitint_17_add_default +// RISCV32_INT128-SAME: (i17 noundef signext [[A:%.*]], i17 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV32_INT128-NEXT: entry: +// 
RISCV32_INT128-NEXT: [[ADD:%.*]] = add nsw i17 [[B]], [[A]] +// RISCV32_INT128-NEXT: ret i17 [[ADD]] +// +_BitInt(17) test_bitint_17_add_default(_BitInt(17) a, _BitInt(17) b) { + return a + b; +} + +// RISCV64-LABEL: define {{[^@]+}}@test_bitint_32_add_unsigned +// RISCV64-SAME: (i32 noundef signext [[A:%.*]], i32 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV64-NEXT: entry: +// RISCV64-NEXT: [[ADD:%.*]] = add i32 [[B]], [[A]] +// RISCV64-NEXT: ret i32 [[ADD]] +// +// RISCV32-LABEL: define {{[^@]+}}@test_bitint_32_add_unsigned +// RISCV32-SAME: (i32 noundef zeroext [[A:%.*]], i32 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV32-NEXT: entry: +// RISCV32-NEXT: [[ADD:%.*]] = add i32 [[B]], [[A]] +// RISCV32-NEXT: ret i32 [[ADD]] +// +// RISCV32_INT128-LABEL: define {{[^@]+}}@test_bitint_32_add_unsigned +// RISCV32_INT128-SAME: (i32 noundef zeroext [[A:%.*]], i32 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV32_INT128-NEXT: entry: +// RISCV32_INT128-NEXT: [[ADD:%.*]] = add i32 [[B]], [[A]] +// RISCV32_INT128-NEXT: ret i32 [[ADD]] +// +unsigned _BitInt(32) test_bitint_32_add_unsigned(unsigned _BitInt(32) a, unsigned _BitInt(32) b) { + return a + b; +} + +// RISCV64-LABEL: define {{[^@]+}}@test_bitint_32_add_signed +// RISCV64-SAME: (i32 noundef signext [[A:%.*]], i32 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV64-NEXT: entry: +// RISCV64-NEXT: [[ADD:%.*]] = add nsw i32 [[B]], [[A]] +// RISCV64-NEXT: ret i32 [[ADD]] +// +// RISCV32-LABEL: define {{[^@]+}}@test_bitint_32_add_signed +// RISCV32-SAME: (i32 noundef signext [[A:%.*]], i32 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV32-NEXT: entry: +// RISCV32-NEXT: [[ADD:%.*]] = add nsw i32 [[B]], [[A]] +// RISCV32-NEXT: ret i32 [[ADD]] +// +// RISCV32_INT128-LABEL: define {{[^@]+}}@test_bitint_32_add_signed +// RISCV32_INT128-SAME: (i32 noundef signext [[A:%.*]], i32 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV32_INT128-NEXT: entry: +// RISCV32_INT128-NEXT: [[ADD:%.*]] = add nsw i32 [[B]], [[A]] +// RISCV32_INT128-NEXT: ret i32 [[ADD]] +// +signed _BitInt(32) test_bitint_32_add_signed(signed _BitInt(32) a, signed _BitInt(32) b) { + return a + b; +} + +// RISCV64-LABEL: define {{[^@]+}}@test_bitint_32_add_default +// RISCV64-SAME: (i32 noundef signext [[A:%.*]], i32 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV64-NEXT: entry: +// RISCV64-NEXT: [[ADD:%.*]] = add nsw i32 [[B]], [[A]] +// RISCV64-NEXT: ret i32 [[ADD]] +// +// RISCV32-LABEL: define {{[^@]+}}@test_bitint_32_add_default +// RISCV32-SAME: (i32 noundef signext [[A:%.*]], i32 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV32-NEXT: entry: +// RISCV32-NEXT: [[ADD:%.*]] = add nsw i32 [[B]], [[A]] +// RISCV32-NEXT: ret i32 [[ADD]] +// +// RISCV32_INT128-LABEL: define {{[^@]+}}@test_bitint_32_add_default +// RISCV32_INT128-SAME: (i32 noundef signext [[A:%.*]], i32 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV32_INT128-NEXT: entry: +// RISCV32_INT128-NEXT: [[ADD:%.*]] = add nsw i32 [[B]], [[A]] +// RISCV32_INT128-NEXT: ret i32 [[ADD]] +// +_BitInt(32) test_bitint_32_add_default(_BitInt(32) a, _BitInt(32) b) { + return a + b; +} + + +// RISCV64-LABEL: define {{[^@]+}}@test_bitint_65_add_unsigned +// RISCV64-SAME: (i65 noundef zeroext [[A:%.*]], i65 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV64-NEXT: entry: +// RISCV64-NEXT: [[ADD:%.*]] = add i65 [[B]], [[A]] +// 
RISCV64-NEXT: ret i65 [[ADD]] +// +// RISCV32-LABEL: define {{[^@]+}}@test_bitint_65_add_unsigned +// RISCV32-SAME: (ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// RISCV32-NEXT: entry: +// RISCV32-NEXT: [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[TBAA6:![0-9]+]] +// RISCV32-NEXT: [[A:%.*]] = trunc i128 [[TMP2]] to i65 +// RISCV32-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP1]], align 8, !tbaa [[TBAA6]] +// RISCV32-NEXT: [[B:%.*]] = trunc i128 [[TMP3]] to i65 +// RISCV32-NEXT: [[ADD:%.*]] = add i65 [[B]], [[A]] +// RISCV32-NEXT: [[STOREDV4:%.*]] = zext i65 [[ADD]] to i128 +// RISCV32-NEXT: store i128 [[STOREDV4]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA6]] +// RISCV32-NEXT: ret void +// +// RISCV32_INT128-LABEL: define {{[^@]+}}@test_bitint_65_add_unsigned +// RISCV32_INT128-SAME: (ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// RISCV32_INT128-NEXT: entry: +// RISCV32_INT128-NEXT: [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[TBAA6:![0-9]+]] +// RISCV32_INT128-NEXT: [[A:%.*]] = trunc i128 [[TMP2]] to i65 +// RISCV32_INT128-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP1]], align 8, !tbaa [[TBAA6]] +// RISCV32_INT128-NEXT: [[B:%.*]] = trunc i128 [[TMP3]] to i65 +// RISCV32_INT128-NEXT: [[ADD:%.*]] = add i65 [[B]], [[A]] +// RISCV32_INT128-NEXT: [[STOREDV4:%.*]] = zext i65 [[ADD]] to i128 +// RISCV32_INT128-NEXT: store i128 [[STOREDV4]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA6]] +// RISCV32_INT128-NEXT: ret void +// +unsigned _BitInt(65) test_bitint_65_add_unsigned(unsigned _BitInt(65) a, unsigned _BitInt(65) b) { + return a + b; +} + +// RISCV64-LABEL: define {{[^@]+}}@test_bitint_65_add_signed +// RISCV64-SAME: (i65 noundef signext [[A:%.*]], i65 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV64-NEXT: entry: +// RISCV64-NEXT: [[ADD:%.*]] = add nsw i65 [[B]], [[A]] +// RISCV64-NEXT: ret i65 [[ADD]] +// +// RISCV32-LABEL: define {{[^@]+}}@test_bitint_65_add_signed +// RISCV32-SAME: (ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR1]] { +// RISCV32-NEXT: entry: +// RISCV32-NEXT: [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] +// RISCV32-NEXT: [[A:%.*]] = trunc i128 [[TMP2]] to i65 +// RISCV32-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP1]], align 8, !tbaa [[TBAA6]] +// RISCV32-NEXT: [[B:%.*]] = trunc i128 [[TMP3]] to i65 +// RISCV32-NEXT: [[ADD:%.*]] = add nsw i65 [[B]], [[A]] +// RISCV32-NEXT: [[STOREDV4:%.*]] = sext i65 [[ADD]] to i128 +// RISCV32-NEXT: store i128 [[STOREDV4]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA6]] +// RISCV32-NEXT: ret void +// +// RISCV32_INT128-LABEL: define {{[^@]+}}@test_bitint_65_add_signed +// RISCV32_INT128-SAME: (ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], 
ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR1]] { +// RISCV32_INT128-NEXT: entry: +// RISCV32_INT128-NEXT: [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] +// RISCV32_INT128-NEXT: [[A:%.*]] = trunc i128 [[TMP2]] to i65 +// RISCV32_INT128-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP1]], align 8, !tbaa [[TBAA6]] +// RISCV32_INT128-NEXT: [[B:%.*]] = trunc i128 [[TMP3]] to i65 +// RISCV32_INT128-NEXT: [[ADD:%.*]] = add nsw i65 [[B]], [[A]] +// RISCV32_INT128-NEXT: [[STOREDV4:%.*]] = sext i65 [[ADD]] to i128 +// RISCV32_INT128-NEXT: store i128 [[STOREDV4]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA6]] +// RISCV32_INT128-NEXT: ret void +// +signed _BitInt(65) test_bitint_65_add_signed(signed _BitInt(65) a, signed _BitInt(65) b) { + return a + b; +} + +// RISCV64-LABEL: define {{[^@]+}}@test_bitint_65_add_default +// RISCV64-SAME: (i65 noundef signext [[A:%.*]], i65 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV64-NEXT: entry: +// RISCV64-NEXT: [[ADD:%.*]] = add nsw i65 [[B]], [[A]] +// RISCV64-NEXT: ret i65 [[ADD]] +// +// RISCV32-LABEL: define {{[^@]+}}@test_bitint_65_add_default +// RISCV32-SAME: (ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR1]] { +// RISCV32-NEXT: entry: +// RISCV32-NEXT: [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] +// RISCV32-NEXT: [[A:%.*]] = trunc i128 [[TMP2]] to i65 +// RISCV32-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP1]], align 8, !tbaa [[TBAA6]] +// RISCV32-NEXT: [[B:%.*]] = trunc i128 [[TMP3]] to i65 +// RISCV32-NEXT: [[ADD:%.*]] = add nsw i65 [[B]], [[A]] +// RISCV32-NEXT: [[STOREDV4:%.*]] = sext i65 [[ADD]] to i128 +// RISCV32-NEXT: store i128 [[STOREDV4]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA6]] +// RISCV32-NEXT: ret void +// +// RISCV32_INT128-LABEL: define {{[^@]+}}@test_bitint_65_add_default +// RISCV32_INT128-SAME: (ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR1]] { +// RISCV32_INT128-NEXT: entry: +// RISCV32_INT128-NEXT: [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] +// RISCV32_INT128-NEXT: [[A:%.*]] = trunc i128 [[TMP2]] to i65 +// RISCV32_INT128-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP1]], align 8, !tbaa [[TBAA6]] +// RISCV32_INT128-NEXT: [[B:%.*]] = trunc i128 [[TMP3]] to i65 +// RISCV32_INT128-NEXT: [[ADD:%.*]] = add nsw i65 [[B]], [[A]] +// RISCV32_INT128-NEXT: [[STOREDV4:%.*]] = sext i65 [[ADD]] to i128 +// RISCV32_INT128-NEXT: store i128 [[STOREDV4]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA6]] +// RISCV32_INT128-NEXT: ret void +// +_BitInt(65) test_bitint_65_add_default(_BitInt(65) a, _BitInt(65) b) { + return a + b; +} + + +// RISCV64-LABEL: define {{[^@]+}}@test_bitint_77_add_unsigned +// RISCV64-SAME: (i77 noundef zeroext [[A:%.*]], i77 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV64-NEXT: entry: +// RISCV64-NEXT: [[ADD:%.*]] = add i77 [[B]], [[A]] +// RISCV64-NEXT: ret i77 [[ADD]] +// +// RISCV32-LABEL: define {{[^@]+}}@test_bitint_77_add_unsigned +// RISCV32-SAME: (ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 
captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR1]] { +// RISCV32-NEXT: entry: +// RISCV32-NEXT: [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[TBAA10:![0-9]+]] +// RISCV32-NEXT: [[A:%.*]] = trunc i128 [[TMP2]] to i77 +// RISCV32-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP1]], align 8, !tbaa [[TBAA10]] +// RISCV32-NEXT: [[B:%.*]] = trunc i128 [[TMP3]] to i77 +// RISCV32-NEXT: [[ADD:%.*]] = add i77 [[B]], [[A]] +// RISCV32-NEXT: [[STOREDV4:%.*]] = zext i77 [[ADD]] to i128 +// RISCV32-NEXT: store i128 [[STOREDV4]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA10]] +// RISCV32-NEXT: ret void +// +// RISCV32_INT128-LABEL: define {{[^@]+}}@test_bitint_77_add_unsigned +// RISCV32_INT128-SAME: (ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR1]] { +// RISCV32_INT128-NEXT: entry: +// RISCV32_INT128-NEXT: [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[TBAA10:![0-9]+]] +// RISCV32_INT128-NEXT: [[A:%.*]] = trunc i128 [[TMP2]] to i77 +// RISCV32_INT128-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP1]], align 8, !tbaa [[TBAA10]] +// RISCV32_INT128-NEXT: [[B:%.*]] = trunc i128 [[TMP3]] to i77 +// RISCV32_INT128-NEXT: [[ADD:%.*]] = add i77 [[B]], [[A]] +// RISCV32_INT128-NEXT: [[STOREDV4:%.*]] = zext i77 [[ADD]] to i128 +// RISCV32_INT128-NEXT: store i128 [[STOREDV4]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA10]] +// RISCV32_INT128-NEXT: ret void +// +unsigned _BitInt(77) test_bitint_77_add_unsigned(unsigned _BitInt(77) a, unsigned _BitInt(77) b) { + return a + b; +} + +// RISCV64-LABEL: define {{[^@]+}}@test_bitint_77_add_signed +// RISCV64-SAME: (i77 noundef signext [[A:%.*]], i77 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV64-NEXT: entry: +// RISCV64-NEXT: [[ADD:%.*]] = add nsw i77 [[B]], [[A]] +// RISCV64-NEXT: ret i77 [[ADD]] +// +// RISCV32-LABEL: define {{[^@]+}}@test_bitint_77_add_signed +// RISCV32-SAME: (ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR1]] { +// RISCV32-NEXT: entry: +// RISCV32-NEXT: [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[TBAA10]] +// RISCV32-NEXT: [[A:%.*]] = trunc i128 [[TMP2]] to i77 +// RISCV32-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP1]], align 8, !tbaa [[TBAA10]] +// RISCV32-NEXT: [[B:%.*]] = trunc i128 [[TMP3]] to i77 +// RISCV32-NEXT: [[ADD:%.*]] = add nsw i77 [[B]], [[A]] +// RISCV32-NEXT: [[STOREDV4:%.*]] = sext i77 [[ADD]] to i128 +// RISCV32-NEXT: store i128 [[STOREDV4]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA10]] +// RISCV32-NEXT: ret void +// +// RISCV32_INT128-LABEL: define {{[^@]+}}@test_bitint_77_add_signed +// RISCV32_INT128-SAME: (ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR1]] { +// RISCV32_INT128-NEXT: entry: +// RISCV32_INT128-NEXT: [[TMP2:%.*]] = load i128, ptr 
[[TMP0]], align 8, !tbaa [[TBAA10]] +// RISCV32_INT128-NEXT: [[A:%.*]] = trunc i128 [[TMP2]] to i77 +// RISCV32_INT128-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP1]], align 8, !tbaa [[TBAA10]] +// RISCV32_INT128-NEXT: [[B:%.*]] = trunc i128 [[TMP3]] to i77 +// RISCV32_INT128-NEXT: [[ADD:%.*]] = add nsw i77 [[B]], [[A]] +// RISCV32_INT128-NEXT: [[STOREDV4:%.*]] = sext i77 [[ADD]] to i128 +// RISCV32_INT128-NEXT: store i128 [[STOREDV4]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA10]] +// RISCV32_INT128-NEXT: ret void +// +signed _BitInt(77) test_bitint_77_add_signed(signed _BitInt(77) a, signed _BitInt(77) b) { + return a + b; +} + +// RISCV64-LABEL: define {{[^@]+}}@test_bitint_77_add_default +// RISCV64-SAME: (i77 noundef signext [[A:%.*]], i77 noundef signext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// RISCV64-NEXT: entry: +// RISCV64-NEXT: [[ADD:%.*]] = add nsw i77 [[B]], [[A]] +// RISCV64-NEXT: ret i77 [[ADD]] +// +// RISCV32-LABEL: define {{[^@]+}}@test_bitint_77_add_default +// RISCV32-SAME: (ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR1]] { +// RISCV32-NEXT: entry: +// RISCV32-NEXT: [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[TBAA10]] +// RISCV32-NEXT: [[A:%.*]] = trunc i128 [[TMP2]] to i77 +// RISCV32-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP1]], align 8, !tbaa [[TBAA10]] +// RISCV32-NEXT: [[B:%.*]] = trunc i128 [[TMP3]] to i77 +// RISCV32-NEXT: [[ADD:%.*]] = add nsw i77 [[B]], [[A]] +// RISCV32-NEXT: [[STOREDV4:%.*]] = sext i77 [[ADD]] to i128 +// RISCV32-NEXT: store i128 [[STOREDV4]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA10]] +// RISCV32-NEXT: ret void +// +// RISCV32_INT128-LABEL: define {{[^@]+}}@test_bitint_77_add_default +// RISCV32_INT128-SAME: (ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR1]] { +// RISCV32_INT128-NEXT: entry: +// RISCV32_INT128-NEXT: [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[TBAA10]] +// RISCV32_INT128-NEXT: [[A:%.*]] = trunc i128 [[TMP2]] to i77 +// RISCV32_INT128-NEXT: [[TMP3:%.*]] = load i128, ptr [[TMP1]], align 8, !tbaa [[TBAA10]] +// RISCV32_INT128-NEXT: [[B:%.*]] = trunc i128 [[TMP3]] to i77 +// RISCV32_INT128-NEXT: [[ADD:%.*]] = add nsw i77 [[B]], [[A]] +// RISCV32_INT128-NEXT: [[STOREDV4:%.*]] = sext i77 [[ADD]] to i128 +// RISCV32_INT128-NEXT: store i128 [[STOREDV4]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA10]] +// RISCV32_INT128-NEXT: ret void +// +_BitInt(77) test_bitint_77_add_default(_BitInt(77) a, _BitInt(77) b) { + return a + b; +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfadd.c new file mode 100644 index 0000000000000..d7734e05a8aaa --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfadd.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature 
+experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfadd_vv_bf16mf4( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfadd_vf_bf16mf4( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfadd_vv_bf16mf2( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfadd.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfadd_vf_bf16mf2( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfadd.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfadd_vv_bf16m1( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfadd.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_vfadd_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfadd_vf_bf16m1( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfadd.nxv4bf16.bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_vfadd_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfadd_vv_bf16m2( +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfadd.nxv8bf16.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[OP1]], <vscale x 8 x bfloat> [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_vfadd_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t 
op2, size_t vl) { + return __riscv_vfadd_vv_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vf_bf16mf4_m(vbool64_t mask, 
vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], 
<vscale x 16 x bfloat> [[OP1:%.*]], <vscale x 16 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfadd.mask.nxv16bf16.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[OP1]], <vscale x 16 x bfloat> [[OP2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_vfadd_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfadd_vf_bf16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfadd.mask.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_vfadd_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfadd_vv_bf16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], <vscale x 32 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfadd.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_vfadd_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfadd_vf_bf16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfadd.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_vfadd_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m8_m(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfclass.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfclass.c new file mode 100644 index 0000000000000..68814f4672d05 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfclass.c @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i16> @test_vfclass_v_bf16mf4_u16mf4( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vfclass.nxv1bf16.i64(<vscale x 1 x i16> poison, <vscale x 1 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x i16> [[TMP0]] +// +vuint16mf4_t test_vfclass_v_bf16mf4_u16mf4(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16mf4_u16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i16> @test_vfclass_v_bf16mf2_u16mf2( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv2bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf2_t test_vfclass_v_bf16mf2_u16mf2(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16mf2_u16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m1_u16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv4bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m1_t test_vfclass_v_bf16m1_u16m1(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m1_u16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m2_u16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv8bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m2_t test_vfclass_v_bf16m2_u16m2(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m2_u16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m4_u16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv16bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m4_t test_vfclass_v_bf16m4_u16m4(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m4_u16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m8_u16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv32bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m8_t test_vfclass_v_bf16m8_u16m8(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m8_u16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf4_u16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf4_t test_vfclass_v_bf16mf4_u16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfclass_v_bf16mf4_u16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf2_u16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf2_t test_vfclass_v_bf16mf2_u16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfclass_v_bf16mf2_u16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m1_u16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m1_t test_vfclass_v_bf16m1_u16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return 
__riscv_vfclass_v_bf16m1_u16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i16> @test_vfclass_v_bf16m2_u16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vfclass.mask.nxv8bf16.i64(<vscale x 8 x i16> poison, <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x i16> [[TMP0]] +// +vuint16m2_t test_vfclass_v_bf16m2_u16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfclass_v_bf16m2_u16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i16> @test_vfclass_v_bf16m4_u16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> @llvm.riscv.vfclass.mask.nxv16bf16.i64(<vscale x 16 x i16> poison, <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x i16> [[TMP0]] +// +vuint16m4_t test_vfclass_v_bf16m4_u16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfclass_v_bf16m4_u16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i16> @test_vfclass_v_bf16m8_u16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vfclass.mask.nxv32bf16.i64(<vscale x 32 x i16> poison, <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x i16> [[TMP0]] +// +vuint16m8_t test_vfclass_v_bf16m8_u16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfclass_v_bf16m8_u16m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmacc.c new file mode 100644 index 0000000000000..616455d5f3f9e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmacc.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmacc_vv_bf16mf4( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmacc.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16mf4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmacc_vf_bf16mf4( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmacc.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16mf4(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfmacc_vv_bf16mf2( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16mf2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16mf2(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m1(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m1(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m2(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, 
vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m4(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m8(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16mf4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16mf4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16mf2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfmacc_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16mf2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m1_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m1_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local <vscale x 16 x bfloat> @test_vfmacc_vf_bf16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfmacc.mask.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfmacc_vv_bf16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfmacc.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m8_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfmacc_vf_bf16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfmacc.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m8_m(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmadd.c new file mode 100644 index 0000000000000..eec662a3671c8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmadd.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmadd_vv_bf16mf4( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmadd.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16mf4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmadd_vf_bf16mf4( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmadd.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, 
vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16mf4(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16mf2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16mf2(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m1(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m1(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m2(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vfmadd.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m4(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m8(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16mf4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16mf4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t 
test_vfmadd_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16mf2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16mf2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m1_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m1_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] 
+// +vbfloat16m4_t test_vfmadd_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfmadd_vf_bf16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfmadd.mask.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfmadd_vv_bf16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfmadd.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m8_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfmadd_vf_bf16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfmadd.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m8_m(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmax.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmax.c new file mode 100644 index 0000000000000..dfdeb4e967a46 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmax.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmax_vv_bf16mf4( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmax.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmax_vf_bf16mf4( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmax.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], bfloat 
[[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return 
__riscv_vfmax_vv_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vbfloat16mf2_t test_vfmax_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m8_m( +// CHECK-RV64-SAME: 
[[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vfmax_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+ return __riscv_vfmax_vv_bf16m8_m(mask, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m8_m(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vfmax_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfmax_vf_bf16m8_m(mask, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmerge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmerge.c
new file mode 100644
index 0000000000000..96221c5385dd9
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmerge.c
@@ -0,0 +1,69 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmerge_vfm_bf16mf4(
+// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmerge_vfm_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, vbool64_t mask, size_t vl) {
+ return __riscv_vfmerge_vfm_bf16mf4(op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmerge_vfm_bf16mf2(
+// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf2_t test_vfmerge_vfm_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, vbool32_t mask, size_t vl) {
+ return __riscv_vfmerge_vfm_bf16mf2(op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmerge_vfm_bf16m1(
+// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m1_t test_vfmerge_vfm_bf16m1(vbfloat16m1_t op1, __bf16 op2, vbool16_t mask, size_t vl) {
+ return __riscv_vfmerge_vfm_bf16m1(op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmerge_vfm_bf16m2(
+// CHECK-RV64-SAME: [[OP1:%.*]],
bfloat noundef [[OP2:%.*]], [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m2_t test_vfmerge_vfm_bf16m2(vbfloat16m2_t op1, __bf16 op2, vbool8_t mask, size_t vl) {
+ return __riscv_vfmerge_vfm_bf16m2(op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmerge_vfm_bf16m4(
+// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m4_t test_vfmerge_vfm_bf16m4(vbfloat16m4_t op1, __bf16 op2, vbool4_t mask, size_t vl) {
+ return __riscv_vfmerge_vfm_bf16m4(op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmerge_vfm_bf16m8(
+// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vfmerge_vfm_bf16m8(vbfloat16m8_t op1, __bf16 op2, vbool2_t mask, size_t vl) {
+ return __riscv_vfmerge_vfm_bf16m8(op1, op2, mask, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmin.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmin.c
new file mode 100644
index 0000000000000..8f8d82ba21bc5
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmin.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf4(
+// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmin_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+ return __riscv_vfmin_vv_bf16mf4(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf4(
+// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmin_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfmin_vf_bf16mf4(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf2(
+// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call
@llvm.riscv.vfmin.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t 
test_vfmin_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmin.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return 
__riscv_vfmin_vv_bf16m8_m(mask, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m8_m(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vfmin_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfmin_vf_bf16m8_m(mask, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmsac.c
new file mode 100644
index 0000000000000..f4644dfb8d7e7
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmsac.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf4(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmsac_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+ return __riscv_vfmsac_vv_bf16mf4(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf4(
+// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmsac_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) {
+ return __riscv_vfmsac_vf_bf16mf4(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf2(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf2_t test_vfmsac_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) {
+ return __riscv_vfmsac_vv_bf16mf2(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf2(
+// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf2_t test_vfmsac_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) {
+ return __riscv_vfmsac_vf_bf16mf2(vd, rs1, vs2,
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m1(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m1(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m2(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m4(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 
[[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m8(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16mf4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16mf4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16mf2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16mf2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] 
+// +vbfloat16m1_t test_vfmsac_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m1_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m1_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret 
[[TMP0]]
+//
+vbfloat16m8_t test_vfmsac_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) {
+ return __riscv_vfmsac_vv_bf16m8_m(mask, vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m8_m(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vfmsac_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) {
+ return __riscv_vfmsac_vf_bf16m8_m(mask, vd, rs1, vs2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmsub.c
new file mode 100644
index 0000000000000..07053afa3355c
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmsub.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf4(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmsub_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+ return __riscv_vfmsub_vv_bf16mf4(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf4(
+// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmsub_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) {
+ return __riscv_vfmsub_vf_bf16mf4(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf2(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf2_t test_vfmsub_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) {
+ return __riscv_vfmsub_vv_bf16mf2(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf2(
+// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]],
i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16mf2(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m1(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m1(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m2(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m4(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m8(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16mf4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16mf4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16mf2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16mf2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m1_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m1_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vfmsub_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) {
+ return __riscv_vfmsub_vv_bf16m8_m(mask, vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m8_m(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vfmsub_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) {
+ return __riscv_vfmsub_vf_bf16m8_m(mask, vd, rs1, vs2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmul.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmul.c
new file mode 100644
index 0000000000000..88fb329934365
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmul.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf4(
+// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmul_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+ return __riscv_vfmul_vv_bf16mf4(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf4(
+// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmul_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfmul_vf_bf16mf4(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf2(
+// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf2_t test_vfmul_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+ return __riscv_vfmul_vv_bf16mf2(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf2(
+// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv2bf16.bf16.i64(
poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t 
test_vfmul_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef 
[[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, 
i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfmul_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfmul_vf_bf16m8_m(mask, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmv.c
new file mode 100644
index 0000000000000..d80ec3df1bbaa
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfmv.c
@@ -0,0 +1,189 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmv_v_f_bf16mf4(
+// CHECK-RV64-SAME: bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmv.v.f.nxv1bf16.i64(<vscale x 1 x bfloat> poison, bfloat [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfmv_v_f_bf16mf4(__bf16 src, size_t vl) {
+  return __riscv_vfmv_v_f_bf16mf4(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfmv_v_f_bf16mf2(
+// CHECK-RV64-SAME: bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfmv.v.f.nxv2bf16.i64(<vscale x 2 x bfloat> poison, bfloat [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfmv_v_f_bf16mf2(__bf16 src, size_t vl) {
+  return __riscv_vfmv_v_f_bf16mf2(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfmv_v_f_bf16m1(
+// CHECK-RV64-SAME: bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfmv.v.f.nxv4bf16.i64(<vscale x 4 x bfloat> poison, bfloat [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfmv_v_f_bf16m1(__bf16 src, size_t vl) {
+  return __riscv_vfmv_v_f_bf16m1(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfmv_v_f_bf16m2(
+// CHECK-RV64-SAME: bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfmv.v.f.nxv8bf16.i64(<vscale x 8 x bfloat> poison, bfloat [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+vbfloat16m2_t test_vfmv_v_f_bf16m2(__bf16 src, size_t vl) {
+  return __riscv_vfmv_v_f_bf16m2(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfmv_v_f_bf16m4(
+// CHECK-RV64-SAME: bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfmv.v.f.nxv16bf16.i64(<vscale x 16 x bfloat> poison, bfloat [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfmv_v_f_bf16m4(__bf16 src, size_t vl) {
+  return __riscv_vfmv_v_f_bf16m4(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfmv_v_f_bf16m8(
+// CHECK-RV64-SAME: bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfmv.v.f.nxv32bf16.i64(<vscale x 32 x bfloat> poison, bfloat [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfmv_v_f_bf16m8(__bf16 src, size_t vl) { + return __riscv_vfmv_v_f_bf16m8(src, vl); +} + +// CHECK-RV64-LABEL: define dso_local bfloat @test_vfmv_f_s_bf16mf4_bf16( +// CHECK-RV64-SAME: [[SRC:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call bfloat @llvm.riscv.vfmv.f.s.nxv1bf16( [[SRC]]) +// CHECK-RV64-NEXT: ret bfloat [[TMP0]] +// +__bf16 test_vfmv_f_s_bf16mf4_bf16(vbfloat16mf4_t src) { + return __riscv_vfmv_f_s_bf16mf4_bf16(src); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16mf4( +// CHECK-RV64-SAME: bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv1bf16.i64( poison, bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmv_s_f_bf16mf4(__bf16 src, size_t vl) { + return __riscv_vfmv_s_f_bf16mf4(src, vl); +} + +// CHECK-RV64-LABEL: define dso_local bfloat @test_vfmv_f_s_bf16mf2_bf16( +// CHECK-RV64-SAME: [[SRC:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call bfloat @llvm.riscv.vfmv.f.s.nxv2bf16( [[SRC]]) +// CHECK-RV64-NEXT: ret bfloat [[TMP0]] +// +__bf16 test_vfmv_f_s_bf16mf2_bf16(vbfloat16mf2_t src) { + return __riscv_vfmv_f_s_bf16mf2_bf16(src); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16mf2( +// CHECK-RV64-SAME: bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv2bf16.i64( poison, bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmv_s_f_bf16mf2(__bf16 src, size_t vl) { + return __riscv_vfmv_s_f_bf16mf2(src, vl); +} + +// CHECK-RV64-LABEL: define dso_local bfloat @test_vfmv_f_s_bf16m1_bf16( +// CHECK-RV64-SAME: [[SRC:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call bfloat @llvm.riscv.vfmv.f.s.nxv4bf16( [[SRC]]) +// CHECK-RV64-NEXT: ret bfloat [[TMP0]] +// +__bf16 test_vfmv_f_s_bf16m1_bf16(vbfloat16m1_t src) { + return __riscv_vfmv_f_s_bf16m1_bf16(src); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16m1( +// CHECK-RV64-SAME: bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv4bf16.i64( poison, bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmv_s_f_bf16m1(__bf16 src, size_t vl) { + return __riscv_vfmv_s_f_bf16m1(src, vl); +} + +// CHECK-RV64-LABEL: define dso_local bfloat @test_vfmv_f_s_bf16m2_bf16( +// CHECK-RV64-SAME: [[SRC:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call bfloat @llvm.riscv.vfmv.f.s.nxv8bf16( [[SRC]]) +// CHECK-RV64-NEXT: ret bfloat [[TMP0]] +// +__bf16 test_vfmv_f_s_bf16m2_bf16(vbfloat16m2_t src) { + return __riscv_vfmv_f_s_bf16m2_bf16(src); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16m2( +// CHECK-RV64-SAME: bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv8bf16.i64( poison, bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmv_s_f_bf16m2(__bf16 src, size_t vl) { + return __riscv_vfmv_s_f_bf16m2(src, vl); +} + +// CHECK-RV64-LABEL: define dso_local bfloat @test_vfmv_f_s_bf16m4_bf16( +// 
CHECK-RV64-SAME: <vscale x 16 x bfloat> [[SRC:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call bfloat @llvm.riscv.vfmv.f.s.nxv16bf16(<vscale x 16 x bfloat> [[SRC]])
+// CHECK-RV64-NEXT:    ret bfloat [[TMP0]]
+//
+__bf16 test_vfmv_f_s_bf16m4_bf16(vbfloat16m4_t src) {
+  return __riscv_vfmv_f_s_bf16m4_bf16(src);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfmv_s_f_bf16m4(
+// CHECK-RV64-SAME: bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfmv.s.f.nxv16bf16.i64(<vscale x 16 x bfloat> poison, bfloat [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfmv_s_f_bf16m4(__bf16 src, size_t vl) {
+  return __riscv_vfmv_s_f_bf16m4(src, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local bfloat @test_vfmv_f_s_bf16m8_bf16(
+// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[SRC:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call bfloat @llvm.riscv.vfmv.f.s.nxv32bf16(<vscale x 32 x bfloat> [[SRC]])
+// CHECK-RV64-NEXT:    ret bfloat [[TMP0]]
+//
+__bf16 test_vfmv_f_s_bf16m8_bf16(vbfloat16m8_t src) {
+  return __riscv_vfmv_f_s_bf16m8_bf16(src);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfmv_s_f_bf16m8(
+// CHECK-RV64-SAME: bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfmv.s.f.nxv32bf16.i64(<vscale x 32 x bfloat> poison, bfloat [[SRC]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfmv_s_f_bf16m8(__bf16 src, size_t vl) {
+  return __riscv_vfmv_s_f_bf16m8(src, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfncvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfncvt.c
new file mode 100644
index 0000000000000..a5afab9bec1ec
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfncvt.c
@@ -0,0 +1,724 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_x_f_w_bf16mf4_i8mf8(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vfncvt.x.f.w.nxv1i8.nxv1bf16.i64(<vscale x 1 x i8> poison, <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i8> [[TMP0]]
+//
+vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8(vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfncvt_x_f_w_bf16mf4_i8mf8(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i8> @test_vfncvt_x_f_w_bf16mf2_i8mf4(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vfncvt.x.f.w.nxv2i8.nxv2bf16.i64(<vscale x 2 x i8> poison, <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i8> [[TMP0]]
+//
+vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4(vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfncvt_x_f_w_bf16mf2_i8mf4(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i8> @test_vfncvt_x_f_w_bf16m1_i8mf2(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i8>
@llvm.riscv.vfncvt.x.f.w.nxv4i8.nxv4bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m1_i8mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv8i8.nxv8bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m2_i8m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv16i8.nxv16bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m4_i8m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv32i8.nxv32bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m8_i8m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv1i8.nxv1bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf4_u8mf8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv2i8.nxv2bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf2_u8mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv4i8.nxv4bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m1_u8mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv8i8.nxv8bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1(vbfloat16m2_t vs2, size_t vl) { + return 
__riscv_vfncvt_xu_f_w_bf16m2_u8m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv16i8.nxv16bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m4_u8m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv32i8.nxv32bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m8_u8m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv1bf16.nxv1f32.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4(vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv2bf16.nxv2f32.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2(vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv4bf16.nxv4f32.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1(vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv8bf16.nxv8f32.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2(vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv16bf16.nxv16f32.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4(vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf4_i8mf8_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf2_i8mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m1_i8mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m2_i8m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m4_i8m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m8_i8m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf4_u8mf8_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfncvt_xu_f_w_bf16mf2_u8mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf2_u8mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m1_u8mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m2_u8m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m4_u8m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m8_u8m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t 
test_vfncvt_f_f_w_bf16mf2_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_m(vbool8_t vm, vfloat32m4_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_m(vbool4_t vm, vfloat32m8_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv1i8.nxv1bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_rm(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf4_i8mf8_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv2i8.nxv2bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_rm(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf2_i8mf4_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv4i8.nxv4bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_rm(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m1_i8mf2_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv8i8.nxv8bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t 
test_vfncvt_x_f_w_bf16m2_i8m1_rm(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m2_i8m1_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv16i8.nxv16bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_rm(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m4_i8m2_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv32i8.nxv32bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_rm(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m8_i8m4_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv1i8.nxv1bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf4_u8mf8_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv2i8.nxv2bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf2_u8mf4_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv4i8.nxv4bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_rm(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m1_u8mf2_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv8i8.nxv8bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_rm(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m2_u8m1_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv16i8.nxv16bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_rm(vbfloat16m4_t 
vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m4_u8m2_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv32i8.nxv32bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_rm(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m8_u8m4_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv1bf16.nxv1f32.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_rm(vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf4_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv2bf16.nxv2f32.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_rm(vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf2_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv4bf16.nxv4f32.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_rm(vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m1_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv8bf16.nxv8f32.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_rm(vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m2_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv16bf16.nxv16f32.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_rm(vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m4_rm(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf4_i8mf8_rm_m(vm, 
vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_m(vbool32_t vm, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf2_i8mf4_rm_m(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m1_i8mf2_rm_m(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m2_i8m1_rm_m(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m4_i8m2_rm_m(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_rm_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m8_i8m4_rm_m(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_m(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_m(vbool32_t vm, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_m(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_m(vbool16_t vm, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m1_u8mf2_rm_m(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m2_u8m1_rm_m(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m4_u8m2_rm_m(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_rm_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m8_u8m4_rm_m(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_rm_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf4_rm_m(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t 
test_vfncvt_f_f_w_bf16mf2_rm_m(vbool32_t vm, vfloat32m1_t vs2,
+                                              size_t vl) {
+  return __riscv_vfncvt_f_f_w_bf16mf2_rm_m(vm, vs2, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfncvt_f_f_w_bf16m1_rm_m(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_rm_m(vbool16_t vm, vfloat32m2_t vs2,
+                                            size_t vl) {
+  return __riscv_vfncvt_f_f_w_bf16m1_rm_m(vm, vs2, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfncvt_f_f_w_bf16m2_rm_m(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_rm_m(vbool8_t vm, vfloat32m4_t vs2,
+                                            size_t vl) {
+  return __riscv_vfncvt_f_f_w_bf16m2_rm_m(vm, vs2, __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfncvt_f_f_w_bf16m4_rm_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 0, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_rm_m(vbool4_t vm, vfloat32m8_t vs2,
+                                            size_t vl) {
+  return __riscv_vfncvt_f_f_w_bf16m4_rm_m(vm, vs2, __RISCV_FRM_RNE, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfncvt_rod.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfncvt_rod.c
new file mode 100644
index 0000000000000..70c377bba1dfb
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfncvt_rod.c
@@ -0,0 +1,113 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfncvt_rod_f_f_w_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv1bf16.nxv1f32.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfncvt_rod_f_f_w_bf16mf4(vfloat32mf2_t vs2, size_t vl) {
+  return __riscv_vfncvt_rod_f_f_w_bf16mf4(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfncvt_rod_f_f_w_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv2bf16.nxv2f32.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfncvt_rod_f_f_w_bf16mf2(vfloat32m1_t vs2, size_t vl) {
+  return __riscv_vfncvt_rod_f_f_w_bf16mf2(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define
dso_local @test_vfncvt_rod_f_f_w_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv4bf16.nxv4f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_rod_f_f_w_bf16m1(vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv8bf16.nxv8f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_rod_f_f_w_bf16m2(vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv16bf16.nxv16f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_rod_f_f_w_bf16m4(vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv1bf16.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_rod_f_f_w_bf16mf4_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv2bf16.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_rod_f_f_w_bf16mf2_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv4bf16.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_rod_f_f_w_bf16m1_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv8bf16.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_rod_f_f_w_bf16m2_m(vbool8_t vm, vfloat32m4_t vs2, + size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m4_m( +// 
CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv16bf16.nxv16f32.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfncvt_rod_f_f_w_bf16m4_m(vbool4_t vm, vfloat32m8_t vs2,
+                                             size_t vl) {
+  return __riscv_vfncvt_rod_f_f_w_bf16m4_m(vm, vs2, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfncvt_rtz.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfncvt_rtz.c
new file mode 100644
index 0000000000000..854e9868109e4
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfncvt_rtz.c
@@ -0,0 +1,267 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv1i8.nxv1bf16.i64(<vscale x 1 x i8> poison, <vscale x 1 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i8> [[TMP0]]
+//
+vint8mf8_t test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8(vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfncvt_rtz_x_f_w_bf16mf4_i8mf8(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i8> @test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv2i8.nxv2bf16.i64(<vscale x 2 x i8> poison, <vscale x 2 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i8> [[TMP0]]
+//
+vint8mf4_t test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4(vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfncvt_rtz_x_f_w_bf16mf2_i8mf4(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i8> @test_vfncvt_rtz_x_f_w_bf16m1_i8mf2(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv4i8.nxv4bf16.i64(<vscale x 4 x i8> poison, <vscale x 4 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x i8> [[TMP0]]
+//
+vint8mf2_t test_vfncvt_rtz_x_f_w_bf16m1_i8mf2(vbfloat16m1_t vs2, size_t vl) {
+  return __riscv_vfncvt_rtz_x_f_w_bf16m1_i8mf2(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i8> @test_vfncvt_rtz_x_f_w_bf16m2_i8m1(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv8i8.nxv8bf16.i64(<vscale x 8 x i8> poison, <vscale x 8 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x i8> [[TMP0]]
+//
+vint8m1_t test_vfncvt_rtz_x_f_w_bf16m2_i8m1(vbfloat16m2_t vs2, size_t vl) {
+  return __riscv_vfncvt_rtz_x_f_w_bf16m2_i8m1(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i8> @test_vfncvt_rtz_x_f_w_bf16m4_i8m2(
+// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vfncvt.rtz.x.f.w.nxv16i8.nxv16bf16.i64(<vscale x 16 x i8> poison, <vscale x 16 x bfloat> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i8> [[TMP0]]
+//
+vint8m2_t
test_vfncvt_rtz_x_f_w_bf16m4_i8m2(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m4_i8m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m8_i8m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv32i8.nxv32bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_rtz_x_f_w_bf16m8_i8m4(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m8_i8m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv1i8.nxv1bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8(vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv2i8.nxv2bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4(vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv4i8.nxv4bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m1_u8mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m2_u8m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv8i8.nxv8bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_rtz_xu_f_w_bf16m2_u8m1(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m2_u8m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m4_u8m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv16i8.nxv16bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_rtz_xu_f_w_bf16m4_u8m2(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m4_u8m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m8_u8m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv32i8.nxv32bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_rtz_xu_f_w_bf16m8_u8m4(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m8_u8m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv1i8.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_m(vbool64_t vm, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv2i8.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_m(vbool32_t vm, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv4i8.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m1_i8mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m2_i8m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv8i8.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_rtz_x_f_w_bf16m2_i8m1_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m2_i8m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m4_i8m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv16i8.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_rtz_x_f_w_bf16m4_i8m2_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m4_i8m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m8_i8m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv32i8.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_rtz_x_f_w_bf16m8_i8m4_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m8_i8m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv1i8.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_m(vbool64_t vm, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv2i8.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_m(vbool32_t vm, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv4i8.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_m(vbool16_t vm, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv8i8.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m2_u8m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv16i8.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m4_u8m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv32i8.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m8_u8m4_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfnmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfnmacc.c new file mode 100644 index 0000000000000..18484883a14f4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfnmacc.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature 
+experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16mf4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16mf4(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16mf2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16mf2(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m1(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m1(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], 
[[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m2(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m4(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m8(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16mf4_t test_vfnmacc_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16mf4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16mf4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16mf2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16mf2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m1_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m1_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, 
i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m8_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m8_m(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfnmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfnmadd.c new file mode 100644 index 0000000000000..e519e5acb4575 --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfnmadd.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmadd_vv_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmadd_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfnmadd_vv_bf16mf4(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmadd_vf_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmadd_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfnmadd_vf_bf16mf4(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfnmadd_vv_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfnmadd.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfnmadd_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfnmadd_vv_bf16mf2(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfnmadd_vf_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfnmadd.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfnmadd_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfnmadd_vf_bf16mf2(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfnmadd_vv_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfnmadd.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS1]], <vscale x 4 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfnmadd_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) {
+  return __riscv_vfnmadd_vv_bf16m1(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfnmadd_vf_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfnmadd.nxv4bf16.bf16.i64(<vscale x 4 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 4 x bfloat> [[VS2]],
i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m1(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m2(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m4(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m8(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf4_m( +// 
CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16mf4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16mf4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16mf2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16mf2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m1_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m1_m(mask, vd, rs1, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m8_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return 
__riscv_vfnmadd_vf_bf16m8_m(mask, vd, rs1, vs2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfnmsac.c
new file mode 100644
index 0000000000000..47e1f44f5a45f
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfnmsac.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsac_vv_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsac_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vv_bf16mf4(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsac_vf_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsac_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vf_bf16mf4(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfnmsac_vv_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsac.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfnmsac_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vv_bf16mf2(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfnmsac_vf_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsac.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfnmsac_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vf_bf16mf2(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfnmsac_vv_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsac.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS1]], <vscale x 4 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfnmsac_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vv_bf16m1(vd,
vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m1(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m2(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m4(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfnmsac.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m8(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16mf4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16mf4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16mf2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16mf2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m1_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m1_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m8_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmsac.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmsac_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vf_bf16m8_m(mask, vd, rs1, vs2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfnmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfnmsub.c
new file mode 100644
index 0000000000000..4b55b64542c61
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfnmsub.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsub_vv_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsub_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfnmsub_vv_bf16mf4(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsub_vf_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsub_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfnmsub_vf_bf16mf4(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfnmsub_vv_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsub.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfnmsub_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfnmsub_vv_bf16mf2(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfnmsub_vf_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsub.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfnmsub_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfnmsub_vf_bf16mf2(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfnmsub_vv_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m1(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m1(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m2(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m4(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return 
__riscv_vfnmsub_vv_bf16m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m8(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16mf4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16mf4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16mf2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16mf2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return 
__riscv_vfnmsub_vv_bf16m1_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m1_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m2_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m2_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m4_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m4_m(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, 
vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m8_m(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m8_m(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfrec7.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfrec7.c new file mode 100644 index 0000000000000..1ffee73e91d04 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfrec7.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv1bf16.i64( poison, [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrec7_v_bf16mf4(vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16mf4(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv2bf16.i64( poison, [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrec7_v_bf16mf2(vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16mf2(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv4bf16.i64( poison, [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrec7_v_bf16m1(vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m1(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv8bf16.i64( poison, [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrec7_v_bf16m2(vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m2(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv16bf16.i64( poison, [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +//
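Illustrative aside, not part of this patch: the vfrec7 tests here exercise __riscv_vfrec7_v_bf16*, the 7-bit reciprocal-estimate intrinsic the zvfbfa support adds for bfloat16 vectors. A minimal strip-mined usage sketch follows; the vsetvl and bf16 load/store names (__riscv_vsetvl_e16m4, __riscv_vle16_v_bf16m4, __riscv_vse16_v_bf16m4) are assumed from the standard RVV C intrinsics and do not appear in this diff.

#include <riscv_vector.h>
#include <stddef.h>

/* Sketch: approximate elementwise reciprocal of a bf16 buffer using vfrec7
   (about 7 bits of accuracy), processing vl elements per iteration. */
void approx_recip_bf16(const __bf16 *src, __bf16 *dst, size_t n) {
  for (size_t i = 0; i < n;) {
    size_t vl = __riscv_vsetvl_e16m4(n - i);          /* elements this pass */
    vbfloat16m4_t v = __riscv_vle16_v_bf16m4(src + i, vl);
    vbfloat16m4_t r = __riscv_vfrec7_v_bf16m4(v, vl); /* reciprocal estimate */
    __riscv_vse16_v_bf16m4(dst + i, r, vl);
    i += vl;
  }
}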
+vbfloat16m4_t test_vfrec7_v_bf16m4(vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m4(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv32bf16.i64( poison, [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrec7_v_bf16m8(vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m8(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv1bf16.i64( poison, [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrec7_v_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16mf4_m(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv2bf16.i64( poison, [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrec7_v_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16mf2_m(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv4bf16.i64( poison, [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrec7_v_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m1_m(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv8bf16.i64( poison, [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrec7_v_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m2_m(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv16bf16.i64( poison, [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrec7_v_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m4_m(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv32bf16.i64( poison, [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrec7_v_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m8_m(mask, op1, vl); +} + diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfrsqrt7.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfrsqrt7.c new file mode 100644 index 0000000000000..964c4869622aa --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfrsqrt7.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv1bf16.i64( poison, [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsqrt7_v_bf16mf4(vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16mf4(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv2bf16.i64( poison, [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsqrt7_v_bf16mf2(vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16mf2(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv4bf16.i64( poison, [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsqrt7_v_bf16m1(vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m1(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv8bf16.i64( poison, [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsqrt7_v_bf16m2(vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m2(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv16bf16.i64( poison, [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsqrt7_v_bf16m4(vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m4(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv32bf16.i64( poison, [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsqrt7_v_bf16m8(vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m8(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +//
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv1bf16.i64( poison, [[OP1]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsqrt7_v_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16mf4_m(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv2bf16.i64( poison, [[OP1]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsqrt7_v_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16mf2_m(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv4bf16.i64( poison, [[OP1]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsqrt7_v_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m1_m(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv8bf16.i64( poison, [[OP1]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsqrt7_v_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m2_m(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv16bf16.i64( poison, [[OP1]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsqrt7_v_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m4_m(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv32bf16.i64( poison, [[OP1]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsqrt7_v_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m8_m(mask, op1, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfrsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfrsub.c new file mode 100644 index 0000000000000..c7c3869e7b77c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfrsub.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ 
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsub_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsub_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsub_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsub_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsub_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsub_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsub_vf_bf16mf4_m(vbool64_t mask,
vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsub_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsub_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsub_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsub_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsub_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m8_m(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfsgnj.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfsgnj.c new file mode 100644 index 0000000000000..778b8b83e9841 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfsgnj.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature 
+experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { +
return __riscv_vfsgnj_vv_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t 
vl) { + return __riscv_vfsgnj_vf_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] 
{ +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m8_m(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfsgnjn.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfsgnjn.c new file mode 100644 index 0000000000000..7de308978e1d3 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfsgnjn.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT:
[[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 
[[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] 
+// +vbfloat16m4_t test_vfsgnjn_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m8_m(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfsgnjx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfsgnjx.c new file mode 100644 index 0000000000000..5fa285cc78b63 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfsgnjx.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16mf2(op1, op2, vl);
+} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m8( +// CHECK-RV64-SAME: 
[[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t 
test_vfsgnjx_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfsgnjx_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m8_m(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfslide1down.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfslide1down.c new file mode 100644 index 0000000000000..b94d26b4ddf40 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfslide1down.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf4( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv1bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1down_vf_bf16mf4(vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16mf4(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf2( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv2bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1down_vf_bf16mf2(vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16mf2(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m1( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv4bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1down_vf_bf16m1(vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m1(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m2( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv8bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1down_vf_bf16m2(vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m2(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m4( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat 
noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv16bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1down_vf_bf16m4(vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m4(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m8( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv32bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1down_vf_bf16m8(vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m8(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv1bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1down_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16mf4_m(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv2bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1down_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16mf2_m(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv4bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1down_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m1_m(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv8bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1down_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m2_m(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv16bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1down_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m4_m(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv32bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1down_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m8_m(mask, src, value, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfslide1up.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfslide1up.c new file mode 100644 index 0000000000000..06e8b49af19d0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfslide1up.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf4( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv1bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1up_vf_bf16mf4(vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16mf4(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf2( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv2bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1up_vf_bf16mf2(vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16mf2(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m1( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv4bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1up_vf_bf16m1(vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m1(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m2( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfslide1up.nxv8bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1up_vf_bf16m2(vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m2(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m4( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv16bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1up_vf_bf16m4(vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m4(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m8( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv32bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1up_vf_bf16m8(vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m8(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv1bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1up_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16mf4_m(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv2bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1up_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16mf2_m(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv4bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1up_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m1_m(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv8bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1up_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t src, 
__bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m2_m(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv16bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1up_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m4_m(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv32bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1up_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m8_m(mask, src, value, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfsub.c new file mode 100644 index 0000000000000..2423b0bbdbb80 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfsub.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16mf4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local @test_vfsub_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16mf2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m1(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16mf4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16mf2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vv_bf16m1_m(vbool16_t mask, 
vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m1_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m8_m(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwadd.c new file mode 100644 index 0000000000000..24d34f46f4203 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwadd.c @@ -0,0 +1,899 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2(vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf4_f32mf2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2(vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf4_f32mf2(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2(vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf4_f32mf2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2(vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32mf2(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1(vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf2_f32m1(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1(vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf2_f32m1(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1(vfloat32m1_t vs2, vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16mf2_f32m1(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1(vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m1(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2(vbfloat16m1_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_bf16m1_f32m2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2(vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m1_f32m2(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2(vfloat32m2_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16m1_f32m2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat 
noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2(vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m2(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4(vbfloat16m2_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_bf16m2_f32m4(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4(vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m2_f32m4(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4(vfloat32m4_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16m2_f32m4(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4(vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m4(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8(vbfloat16m4_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_bf16m4_f32m8(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8(vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m4_f32m8(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vfwadd_wv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8(vfloat32m8_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16m4_f32m8(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8(vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m8(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_m(vbool64_t vm, vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf4_f32mf2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_m(vbool64_t vm, vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16mf4_f32mf2_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf4_f32mf2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32mf2_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf2_f32m1_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16mf2_f32m1_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf2_f32m1_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m1_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m1_f32m2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16m1_f32m2_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], 
[[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m1_f32m2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m2_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m2_f32m4_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16m2_f32m4_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m2_f32m4_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m4_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t 
test_vfwadd_vv_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m4_f32m8_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16m4_f32m8_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m4_f32m8_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m8_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_rm(vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf4_f32mf2_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_rm(vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf4_f32mf2_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_rm(vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf4_f32mf2_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_rm(vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32mf2_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_rm(vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf2_f32m1_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_rm(vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf2_f32m1_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_rm(vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf2_f32m1_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_rm(vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m1_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_rm(vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m1_f32m2_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwadd.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_rm(vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m1_f32m2_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_rm(vfloat32m2_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16m1_f32m2_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_rm(vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m2_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_rm(vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m2_f32m4_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_rm(vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m2_f32m4_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_rm(vfloat32m4_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16m2_f32m4_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_rm(vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m4_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + 
+// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_rm(vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m4_f32m8_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_rm(vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m4_f32m8_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_rm(vfloat32m8_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16m4_f32m8_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_rm(vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m8_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_bf16mf4_f32mf2_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf4_f32mf2_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16mf4_f32mf2_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_rm_m(vbool64_t vm, vfloat32mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32mf2_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf2_f32m1_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16mf2_f32m1_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf2_f32m1_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m1_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwadd_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m1_f32m2_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16m1_f32m2_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m1_f32m2_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m2_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m2_f32m4_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16m2_f32m4_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m2_f32m4_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m4_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m4_f32m8_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16m4_f32m8_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m4_f32m8_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return 
__riscv_vfwadd_wf_bf16_f32m8_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwcvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwcvt.c
new file mode 100644
index 0000000000000..fb3e0031af98c
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwcvt.c
@@ -0,0 +1,366 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfwcvt_f_x_v_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv1bf16.nxv1i8.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x i8> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfwcvt_f_x_v_bf16mf4(vint8mf8_t vs2, size_t vl) {
+  return __riscv_vfwcvt_f_x_v_bf16mf4(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfwcvt_f_x_v_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv2bf16.nxv2i8.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x i8> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfwcvt_f_x_v_bf16mf2(vint8mf4_t vs2, size_t vl) {
+  return __riscv_vfwcvt_f_x_v_bf16mf2(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfwcvt_f_x_v_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv4bf16.nxv4i8.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x i8> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfwcvt_f_x_v_bf16m1(vint8mf2_t vs2, size_t vl) {
+  return __riscv_vfwcvt_f_x_v_bf16m1(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfwcvt_f_x_v_bf16m2(
+// CHECK-RV64-SAME: <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv8bf16.nxv8i8.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x i8> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
+//
+vbfloat16m2_t test_vfwcvt_f_x_v_bf16m2(vint8m1_t vs2, size_t vl) {
+  return __riscv_vfwcvt_f_x_v_bf16m2(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfwcvt_f_x_v_bf16m4(
+// CHECK-RV64-SAME: <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv16bf16.nxv16i8.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x i8> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfwcvt_f_x_v_bf16m4(vint8m2_t vs2, size_t vl) {
+  return __riscv_vfwcvt_f_x_v_bf16m4(vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfwcvt_f_x_v_bf16m8(
+// CHECK-RV64-SAME: <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfwcvt.f.x.v.nxv32bf16.nxv32i8.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x i8> [[VS2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfwcvt_f_x_v_bf16m8(vint8m4_t vs2, size_t vl) {
+  return __riscv_vfwcvt_f_x_v_bf16m8(vs2, vl);
+}
+
+// 
CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv1bf16.nxv1i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_xu_v_bf16mf4(vuint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv2bf16.nxv2i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_xu_v_bf16mf2(vuint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv4bf16.nxv4i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_xu_v_bf16m1(vuint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv8bf16.nxv8i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_xu_v_bf16m2(vuint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv16bf16.nxv16i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_xu_v_bf16m4(vuint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv32bf16.nxv32i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_xu_v_bf16m8(vuint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwcvt_f_f_v_bf16mf4_f32mf2(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16mf4_f32mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv2f32.nxv2bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t 
test_vfwcvt_f_f_v_bf16mf2_f32m1(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16mf2_f32m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_bf16m1_f32m2(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m1_f32m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwcvt_f_f_v_bf16m2_f32m4(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m2_f32m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwcvt_f_f_v_bf16m4_f32m8(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m4_f32m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv1bf16.nxv1i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_x_v_bf16mf4_m(vbool64_t vm, vint8mf8_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv2bf16.nxv2i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_x_v_bf16mf2_m(vbool32_t vm, vint8mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv4bf16.nxv4i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_x_v_bf16m1_m(vbool16_t vm, vint8mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv8bf16.nxv8i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_x_v_bf16m2_m(vbool8_t vm, vint8m1_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m2_m(vm, vs2, vl); +} 
+ +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv16bf16.nxv16i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_x_v_bf16m4_m(vbool4_t vm, vint8m2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv32bf16.nxv32i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_x_v_bf16m8_m(vbool2_t vm, vint8m4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m8_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv1bf16.nxv1i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_xu_v_bf16mf4_m(vbool64_t vm, vuint8mf8_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv2bf16.nxv2i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_xu_v_bf16mf2_m(vbool32_t vm, vuint8mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv4bf16.nxv4i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_xu_v_bf16m1_m(vbool16_t vm, vuint8mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv8bf16.nxv8i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_xu_v_bf16m2_m(vbool8_t vm, vuint8m1_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv16bf16.nxv16i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_xu_v_bf16m4_m(vbool4_t vm, vuint8m2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m4_m(vm, 
vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv32bf16.nxv32i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_xu_v_bf16m8_m(vbool2_t vm, vuint8m4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m8_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwcvt_f_f_v_bf16mf4_f32mf2_m(vbool64_t vm, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16mf4_f32mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwcvt_f_f_v_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16mf2_f32m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m1_f32m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwcvt_f_f_v_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m2_f32m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwcvt_f_f_v_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m4_f32m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwmacc.c new file mode 100644 index 0000000000000..be09003320386 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwmacc.c @@ 
-0,0 +1,486 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwmacc_vv_bf16mf4_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwmacc.nxv1f32.nxv1bf16.nxv1bf16.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2(vfloat32mf2_t vd,
+                                             vbfloat16mf4_t vs1,
+                                             vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfwmacc_vv_bf16mf4_f32mf2(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwmacc_vf_bf16mf4_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], bfloat noundef [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwmacc.nxv1f32.bf16.nxv1bf16.i64(<vscale x 1 x float> [[VD]], bfloat [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2(vfloat32mf2_t vd, __bf16 vs1,
+                                             vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfwmacc_vf_bf16mf4_f32mf2(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwmacc_vv_bf16mf2_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwmacc.nxv2f32.nxv2bf16.nxv2bf16.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x bfloat> [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1(vfloat32m1_t vd, vbfloat16mf2_t vs1,
+                                           vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfwmacc_vv_bf16mf2_f32m1(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwmacc_vf_bf16mf2_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], bfloat noundef [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwmacc.nxv2f32.bf16.nxv2bf16.i64(<vscale x 2 x float> [[VD]], bfloat [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1(vfloat32m1_t vd, __bf16 vs1,
+                                           vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfwmacc_vf_bf16mf2_f32m1(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwmacc_vv_bf16m1_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x bfloat> [[VS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwmacc.nxv4f32.nxv4bf16.nxv4bf16.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x bfloat> [[VS1]], <vscale x 4 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2(vfloat32m2_t vd, vbfloat16m1_t vs1,
+                                          vbfloat16m1_t vs2, size_t vl) {
+  return __riscv_vfwmacc_vv_bf16m1_f32m2(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwmacc_vf_bf16m1_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], bfloat noundef [[VS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call 
@llvm.riscv.vfwmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16m1_f32m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m2_f32m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16m2_f32m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m4_f32m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16m4_f32m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16mf4_f32mf2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat 
[[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf4_f32mf2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16mf2_f32m1_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf2_f32m1_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m1_f32m2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m1_f32m2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m2_f32m4_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m2_f32m4_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m4_f32m8_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m4_f32m8_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16mf4_f32mf2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16mf4_f32mf2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_rm(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16mf2_f32m1_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwmacc_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_rm(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16mf2_f32m1_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_rm(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m1_f32m2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_rm(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16m1_f32m2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_rm(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m2_f32m4_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_rm(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16m2_f32m4_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_rm(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m4_f32m8_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vfwmacc_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_rm(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16m4_f32m8_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16mf4_f32mf2_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf4_f32mf2_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16mf2_f32m1_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf2_f32m1_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], 
i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m1_f32m2_rm_m(vm, vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m1_f32m2_rm_m(vm, vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m2_f32m4_rm_m(vm, vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m2_f32m4_rm_m(vm, vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m4_f32m8_rm_m(vm, vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m4_f32m8_rm_m(vm, vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwmsac.c new file mode 100644 index 0000000000000..749081333c2b3 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwmsac.c @@ -0,0 +1,486 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16mf4_f32mf2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16mf4_f32mf2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1(vfloat32m1_t vd, vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16mf2_f32m1(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16mf2_f32m1(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t 
vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m1_f32m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16m1_f32m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m2_f32m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16m2_f32m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m4_f32m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16m4_f32m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16mf4_f32mf2_m(vm, vd, vs1, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf4_f32mf2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16mf2_f32m1_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf2_f32m1_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m1_f32m2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m1_f32m2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t 
test_vfwmsac_vv_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m2_f32m4_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m2_f32m4_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m4_f32m8_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m4_f32m8_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16mf4_f32mf2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16mf4_f32mf2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( 
[[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_rm(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16mf2_f32m1_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_rm(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16mf2_f32m1_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_rm(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m1_f32m2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_rm(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16m1_f32m2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_rm(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m2_f32m4_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_rm(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16m2_f32m4_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_rm(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m4_f32m8_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_rm(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16m4_f32m8_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16mf4_f32mf2_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf4_f32mf2_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16mf2_f32m1_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf2_f32m1_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m1_f32m2_rm_m(vm, vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m1_f32m2_rm_m(vm, vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m2_f32m4_rm_m(vm, vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m2_f32m4_rm_m(vm, vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m4_f32m8_rm_m(vm, vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m4_f32m8_rm_m(vm, vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwmul.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwmul.c new file mode 100644 index 0000000000000..6783ba43b0570 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwmul.c @@ -0,0 +1,455 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2(vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf4_f32mf2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2(vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf4_f32mf2(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1(vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf2_f32m1(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1(vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf2_f32m1(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmul.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2(vbfloat16m1_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwmul_vv_bf16m1_f32m2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2(vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m1_f32m2(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4(vbfloat16m2_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwmul_vv_bf16m2_f32m4(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4(vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m2_f32m4(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8(vbfloat16m4_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwmul_vv_bf16m4_f32m8(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8(vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m4_f32m8(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_m(vbool64_t vm, vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf4_f32mf2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwmul_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_m(vbool64_t vm, vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16mf4_f32mf2_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf2_f32m1_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16mf2_f32m1_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m1_f32m2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16m1_f32m2_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m2_f32m4_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], 
[[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16m2_f32m4_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m4_f32m8_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16m4_f32m8_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_rm(vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf4_f32mf2_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_rm(vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf4_f32mf2_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_rm(vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf2_f32m1_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmul.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_rm(vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf2_f32m1_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_rm(vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m1_f32m2_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_rm(vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m1_f32m2_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_rm(vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m2_f32m4_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_rm(vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m2_f32m4_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_rm(vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m4_f32m8_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_rm(vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return 
__riscv_vfwmul_vf_bf16m4_f32m8_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwmul_vv_bf16mf4_f32mf2_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf4_f32mf2_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf2_f32m1_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16mf2_f32m1_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m1_f32m2_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16m1_f32m2_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m2_f32m4_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16m2_f32m4_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m4_f32m8_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16m4_f32m8_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwnmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwnmacc.c new file mode 100644 index 0000000000000..6127a94c919d9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwnmacc.c @@ -0,0 +1,494 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf4_f32mf2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf4_f32mf2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1(vfloat32m1_t vd, vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf2_f32m1(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf2_f32m1(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m1_f32m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16m1_f32m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwnmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m2_f32m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16m2_f32m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m4_f32m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16m4_f32m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf4_f32mf2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf4_f32mf2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf2_f32m1_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf2_f32m1_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m1_f32m2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m1_f32m2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m2_f32m4_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m2_f32m4_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwnmacc_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m4_f32m8_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m4_f32m8_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf4_f32mf2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf4_f32mf2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_rm(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf2_f32m1_rm(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_rm(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + 
return __riscv_vfwnmacc_vf_bf16mf2_f32m1_rm(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_rm(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m1_f32m2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_rm(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16m1_f32m2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_rm(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m2_f32m4_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_rm(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16m2_f32m4_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_rm(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m4_f32m8_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_rm(vfloat32m8_t vd, __bf16 
vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16m4_f32m8_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf4_f32mf2_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf4_f32mf2_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf2_f32m1_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf2_f32m1_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m1_f32m2_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], 
[[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m1_f32m2_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m2_f32m4_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m2_f32m4_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m4_f32m8_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m4_f32m8_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwnmsac.c new file mode 100644 index 0000000000000..f37dd310d944d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwnmsac.c @@ -0,0 +1,494 @@ +// NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf4_f32mf2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf4_f32mf2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1(vfloat32m1_t vd, vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf2_f32m1(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf2_f32m1(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m1_f32m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]],
[[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16m1_f32m2(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m2_f32m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16m2_f32m4(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m4_f32m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16m4_f32m8(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf4_f32mf2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) 
+// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf4_f32mf2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf2_f32m1_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf2_f32m1_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m1_f32m2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m1_f32m2_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m2_f32m4_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m2_f32m4_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m4_f32m8_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m4_f32m8_m(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf4_f32mf2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf4_f32mf2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_rm(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf2_f32m1_rm(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwnmsac_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_rm(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf2_f32m1_rm(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_rm(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m1_f32m2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_rm(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16m1_f32m2_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_rm(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m2_f32m4_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_rm(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16m2_f32m4_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_rm(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m4_f32m8_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + 
+// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_rm(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16m4_f32m8_rm(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf4_f32mf2_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf4_f32mf2_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf2_f32m1_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf2_f32m1_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( 
[[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m1_f32m2_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m1_f32m2_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m2_f32m4_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m2_f32m4_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m4_f32m8_rm_m(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m4_f32m8_rm_m(vm, vd, vs1, vs2, + 
__RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwsub.c new file mode 100644 index 0000000000000..510ff9193389e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vfwsub.c @@ -0,0 +1,899 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2(vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf4_f32mf2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2(vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf4_f32mf2(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2(vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf4_f32mf2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2(vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32mf2(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1(vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf2_f32m1(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef
[[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1(vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf2_f32m1(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1(vfloat32m1_t vs2, vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16mf2_f32m1(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1(vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m1(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2(vbfloat16m1_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_bf16m1_f32m2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2(vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m1_f32m2(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2(vfloat32m2_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16m1_f32m2(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2(vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m2(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwsub_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4(vbfloat16m2_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_bf16m2_f32m4(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4(vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m2_f32m4(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4(vfloat32m4_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16m2_f32m4(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4(vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m4(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8(vbfloat16m4_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_bf16m4_f32m8(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8(vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m4_f32m8(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8(vfloat32m8_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return 
__riscv_vfwsub_wv_bf16m4_f32m8(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8(vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m8(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_m(vbool64_t vm, vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf4_f32mf2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_m(vbool64_t vm, vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16mf4_f32mf2_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf4_f32mf2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32mf2_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf2_f32m1_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_m( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16mf2_f32m1_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf2_f32m1_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m1_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m1_f32m2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16m1_f32m2_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m1_f32m2_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m2_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m2_f32m4_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16m2_f32m4_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m2_f32m4_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m4_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m4_f32m8_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16m4_f32m8_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m4_f32m8_m(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m8_m(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_rm(vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf4_f32mf2_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_rm(vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf4_f32mf2_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_rm(vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf4_f32mf2_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_rm(vfloat32mf2_t 
vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32mf2_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_rm(vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf2_f32m1_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_rm(vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf2_f32m1_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_rm(vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf2_f32m1_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_rm(vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m1_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_rm(vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m1_f32m2_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_rm(vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m1_f32m2_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_rm(vfloat32m2_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16m1_f32m2_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_rm(vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m2_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_rm(vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m2_f32m4_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_rm(vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m2_f32m4_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_rm(vfloat32m4_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16m2_f32m4_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_rm(vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m4_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_rm(vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + 
return __riscv_vfwsub_vv_bf16m4_f32m8_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_rm(vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m4_f32m8_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_rm(vfloat32m8_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16m4_f32m8_rm(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_rm(vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m8_rm(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_bf16mf4_f32mf2_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf4_f32mf2_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16mf4_f32mf2_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + 
+// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_rm_m(vbool64_t vm, vfloat32mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32mf2_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf2_f32m1_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16mf2_f32m1_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf2_f32m1_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m1_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return 
__riscv_vfwsub_vv_bf16m1_f32m2_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16m1_f32m2_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m1_f32m2_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m2_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m2_f32m4_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16m2_f32m4_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vs2, + 
vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m2_f32m4_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m4_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m4_f32m8_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16m4_f32m8_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m4_f32m8_rm_m(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m8_rm_m(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfeq.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfeq.c new file mode 100644 index 0000000000000..669d0427b569a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfeq.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16mf4_b64( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv1bf16.nxv1bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfeq_vv_bf16mf4_b64(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16mf4_b64(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16mf4_b64( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv1bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfeq_vf_bf16mf4_b64(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16mf4_b64(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv2bf16.nxv2bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfeq_vv_bf16mf2_b32(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16mf2_b32(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv2bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfeq_vf_bf16mf2_b32(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16mf2_b32(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv4bf16.nxv4bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfeq_vv_bf16m1_b16(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16m1_b16(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv4bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfeq_vf_bf16m1_b16(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16m1_b16(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv8bf16.nxv8bf16.i64( [[OP1]], [[OP2]], i64 
[[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfeq_vv_bf16m2_b8(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16m2_b8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv8bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfeq_vf_bf16m2_b8(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16m2_b8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv16bf16.nxv16bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfeq_vv_bf16m4_b4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16m4_b4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv16bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfeq_vf_bf16m4_b4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16m4_b4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv32bf16.nxv32bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfeq_vv_bf16m8_b2(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16m8_b2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv32bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfeq_vf_bf16m8_b2(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16m8_b2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfeq_vv_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16mf4_b64_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t 
test_vmfeq_vf_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16mf4_b64_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfeq_vv_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16mf2_b32_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfeq_vf_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16mf2_b32_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfeq_vv_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16m1_b16_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfeq_vf_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16m1_b16_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfeq_vv_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16m2_b8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfeq_vf_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16m2_b8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfeq_vv_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16m4_b4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfeq_vf_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16m4_b4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfeq_vv_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16m8_b2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfeq_vf_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16m8_b2_m(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfge.c new file mode 100644 index 0000000000000..b169efd51a0b4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfge.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16mf4_b64( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv1bf16.nxv1bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfge_vv_bf16mf4_b64(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16mf4_b64(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16mf4_b64( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv1bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfge_vf_bf16mf4_b64(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16mf4_b64(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv2bf16.nxv2bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfge_vv_bf16mf2_b32(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16mf2_b32(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv2bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfge_vf_bf16mf2_b32(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16mf2_b32(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv4bf16.nxv4bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfge_vv_bf16m1_b16(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16m1_b16(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv4bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfge_vf_bf16m1_b16(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16m1_b16(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv8bf16.nxv8bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfge_vv_bf16m2_b8(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16m2_b8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv8bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfge_vf_bf16m2_b8(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16m2_b8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv16bf16.nxv16bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t 
test_vmfge_vv_bf16m4_b4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16m4_b4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv16bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfge_vf_bf16m4_b4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16m4_b4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv32bf16.nxv32bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfge_vv_bf16m8_b2(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16m8_b2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv32bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfge_vf_bf16m8_b2(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16m8_b2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfge_vv_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16mf4_b64_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfge_vf_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16mf4_b64_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfge_vv_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16mf2_b32_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv2bf16.bf16.i64( poison, 
[[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfge_vf_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16mf2_b32_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfge_vv_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16m1_b16_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfge_vf_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16m1_b16_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfge_vv_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16m2_b8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfge_vf_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16m2_b8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfge_vv_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16m4_b4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfge_vf_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16m4_b4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local @test_vmfge_vv_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfge_vv_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16m8_b2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfge_vf_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16m8_b2_m(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfgt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfgt.c new file mode 100644 index 0000000000000..9aea7d24b0edc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfgt.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16mf4_b64( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv1bf16.nxv1bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfgt_vv_bf16mf4_b64(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16mf4_b64(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16mf4_b64( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv1bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfgt_vf_bf16mf4_b64(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16mf4_b64(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv2bf16.nxv2bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfgt_vv_bf16mf2_b32(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16mf2_b32(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vmfgt.nxv2bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfgt_vf_bf16mf2_b32(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16mf2_b32(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv4bf16.nxv4bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfgt_vv_bf16m1_b16(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16m1_b16(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv4bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfgt_vf_bf16m1_b16(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16m1_b16(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv8bf16.nxv8bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfgt_vv_bf16m2_b8(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16m2_b8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv8bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfgt_vf_bf16m2_b8(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16m2_b8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv16bf16.nxv16bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfgt_vv_bf16m4_b4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16m4_b4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv16bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfgt_vf_bf16m4_b4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16m4_b4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv32bf16.nxv32bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfgt_vv_bf16m8_b2(vbfloat16m8_t op1, vbfloat16m8_t 
op2, size_t vl) { + return __riscv_vmfgt_vv_bf16m8_b2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv32bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfgt_vf_bf16m8_b2(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16m8_b2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfgt_vv_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16mf4_b64_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfgt_vf_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16mf4_b64_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfgt_vv_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16mf2_b32_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfgt_vf_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16mf2_b32_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfgt_vv_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16m1_b16_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfgt_vf_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16m1_b16_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfgt_vv_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16m2_b8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfgt_vf_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16m2_b8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfgt_vv_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16m4_b4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfgt_vf_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16m4_b4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfgt_vv_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16m8_b2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfgt_vf_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return 
__riscv_vmfgt_vf_bf16m8_b2_m(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfle.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfle.c new file mode 100644 index 0000000000000..40f0c27f5b37a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfle.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfle_vv_bf16mf4_b64( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfle.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x i1> [[TMP0]] +// +vbool64_t test_vmfle_vv_bf16mf4_b64(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16mf4_b64(op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfle_vf_bf16mf4_b64( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfle.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x i1> [[TMP0]] +// +vbool64_t test_vmfle_vf_bf16mf4_b64(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16mf4_b64(op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfle_vv_bf16mf2_b32( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfle.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x i1> [[TMP0]] +// +vbool32_t test_vmfle_vv_bf16mf2_b32(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16mf2_b32(op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfle_vf_bf16mf2_b32( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfle.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x i1> [[TMP0]] +// +vbool32_t test_vmfle_vf_bf16mf2_b32(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16mf2_b32(op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmfle_vv_bf16m1_b16( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x i1> [[TMP0]] +// +vbool16_t test_vmfle_vv_bf16m1_b16(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16m1_b16(op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmfle_vf_bf16m1_b16( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmfle.nxv4bf16.bf16.i64(<vscale x 4 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x i1> [[TMP0]] +// +vbool16_t test_vmfle_vf_bf16m1_b16(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16m1_b16(op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmfle_vv_bf16m2_b8( +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmfle.nxv8bf16.nxv8bf16.i64(<vscale x 8 x bfloat> [[OP1]], <vscale x 8 x bfloat> [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x i1> [[TMP0]] +// +vbool8_t test_vmfle_vv_bf16m2_b8(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16m2_b8(op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmfle_vf_bf16m2_b8( +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmfle.nxv8bf16.bf16.i64(<vscale x 8 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x i1> [[TMP0]] +// +vbool8_t test_vmfle_vf_bf16m2_b8(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16m2_b8(op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfle_vv_bf16m4_b4( +// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[OP1:%.*]], <vscale x 16 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfle.nxv16bf16.nxv16bf16.i64(<vscale x 16 x bfloat> [[OP1]], <vscale x 16 x bfloat> [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x i1> [[TMP0]] +// +vbool4_t test_vmfle_vv_bf16m4_b4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16m4_b4(op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfle_vf_bf16m4_b4( +// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfle.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x i1> [[TMP0]] +// +vbool4_t test_vmfle_vf_bf16m4_b4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16m4_b4(op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfle_vv_bf16m8_b2( +// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[OP1:%.*]], <vscale x 32 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfle.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x i1> [[TMP0]] +// +vbool2_t test_vmfle_vv_bf16m8_b2(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16m8_b2(op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfle_vf_bf16m8_b2( +// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfle.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x i1> [[TMP0]] +// +vbool2_t test_vmfle_vf_bf16m8_b2(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16m8_b2(op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfle_vv_bf16mf4_b64_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1bf16.nxv1bf16.i64(<vscale x 1 x i1> poison, <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], <vscale x 1 x i1> [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x i1> [[TMP0]] +// +vbool64_t test_vmfle_vv_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16mf4_b64_m(mask, op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfle_vf_bf16mf4_b64_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1bf16.bf16.i64(<vscale x 1 x i1> poison, <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 1 x i1> [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x i1> [[TMP0]] +// +vbool64_t test_vmfle_vf_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16mf4_b64_m(mask, op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfle_vv_bf16mf2_b32_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2bf16.nxv2bf16.i64(<vscale x 2 x i1> poison, <vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], <vscale x 2 x i1> [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x i1> [[TMP0]] +// +vbool32_t test_vmfle_vv_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16mf2_b32_m(mask, op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfle_vf_bf16mf2_b32_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2bf16.bf16.i64(<vscale x 2 x i1> poison, <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 2 x i1> [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x i1> [[TMP0]] +// +vbool32_t test_vmfle_vf_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16mf2_b32_m(mask, op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmfle_vv_bf16m1_b16_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4bf16.nxv4bf16.i64(<vscale x 4 x i1> poison, <vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], <vscale x 4 x i1> [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x i1> [[TMP0]] +// +vbool16_t test_vmfle_vv_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16m1_b16_m(mask, op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmfle_vf_bf16m1_b16_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4bf16.bf16.i64(<vscale x 4 x i1> poison, <vscale x 4 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 4 x i1> [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x i1> [[TMP0]] +// +vbool16_t test_vmfle_vf_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16m1_b16_m(mask, op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmfle_vv_bf16m2_b8_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmfle.mask.nxv8bf16.nxv8bf16.i64(<vscale x 8 x i1> poison, <vscale x 8 x bfloat> [[OP1]], <vscale x 8 x bfloat> [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x i1> [[TMP0]] +// +vbool8_t test_vmfle_vv_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16m2_b8_m(mask, op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmfle_vf_bf16m2_b8_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmfle.mask.nxv8bf16.bf16.i64(<vscale x 8 x i1> poison, <vscale x 8 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x i1> [[TMP0]] +// +vbool8_t test_vmfle_vf_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16m2_b8_m(mask, op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfle_vv_bf16m4_b4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], <vscale x 16 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfle.mask.nxv16bf16.nxv16bf16.i64(<vscale x 16 x i1> poison, <vscale x 16 x bfloat> [[OP1]], <vscale x 16 x bfloat> [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x i1> [[TMP0]] +// +vbool4_t test_vmfle_vv_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16m4_b4_m(mask, op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfle_vf_bf16m4_b4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfle.mask.nxv16bf16.bf16.i64(<vscale x 16 x i1> poison, <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x i1> [[TMP0]] +// +vbool4_t test_vmfle_vf_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16m4_b4_m(mask, op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfle_vv_bf16m8_b2_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], <vscale x 32 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfle.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x i1> poison, <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x i1> [[TMP0]] +// +vbool2_t test_vmfle_vv_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16m8_b2_m(mask, op1, op2, vl); +} +
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfle_vf_bf16m8_b2_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfle.mask.nxv32bf16.bf16.i64(<vscale x 32 x i1> poison, <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x i1> [[TMP0]] +// +vbool2_t test_vmfle_vf_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16m8_b2_m(mask, op1, op2, vl); +} +
 diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmflt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmflt.c new file mode 100644 index 0000000000000..f64eee3effdaf --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmflt.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16mf4_b64( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv1bf16.nxv1bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmflt_vv_bf16mf4_b64(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16mf4_b64(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16mf4_b64( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv1bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmflt_vf_bf16mf4_b64(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16mf4_b64(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv2bf16.nxv2bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmflt_vv_bf16mf2_b32(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16mf2_b32(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv2bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmflt_vf_bf16mf2_b32(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16mf2_b32(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv4bf16.nxv4bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmflt_vv_bf16m1_b16(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16m1_b16(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv4bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmflt_vf_bf16m1_b16(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16m1_b16(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv8bf16.nxv8bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmflt_vv_bf16m2_b8(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16m2_b8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv8bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmflt_vf_bf16m2_b8(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16m2_b8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv16bf16.nxv16bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmflt_vv_bf16m4_b4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16m4_b4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv16bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmflt_vf_bf16m4_b4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16m4_b4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv32bf16.nxv32bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmflt_vv_bf16m8_b2(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16m8_b2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv32bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmflt_vf_bf16m8_b2(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16m8_b2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmflt_vv_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16mf4_b64_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmflt_vf_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16mf4_b64_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmflt_vv_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16mf2_b32_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmflt_vf_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16mf2_b32_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmflt_vv_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16m1_b16_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmflt_vf_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16m1_b16_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmflt_vv_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16m2_b8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmflt_vf_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16m2_b8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmflt_vv_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16m4_b4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vmflt_vf_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmflt_vf_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16m4_b4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmflt_vv_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16m8_b2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmflt_vf_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16m8_b2_m(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfne.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfne.c new file mode 100644 index 0000000000000..809ea5628e394 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/non-overloaded/vmfne.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16mf4_b64( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv1bf16.nxv1bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfne_vv_bf16mf4_b64(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16mf4_b64(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16mf4_b64( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv1bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfne_vf_bf16mf4_b64(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16mf4_b64(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv2bf16.nxv2bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfne_vv_bf16mf2_b32(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16mf2_b32(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv2bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfne_vf_bf16mf2_b32(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16mf2_b32(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv4bf16.nxv4bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfne_vv_bf16m1_b16(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16m1_b16(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv4bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfne_vf_bf16m1_b16(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16m1_b16(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv8bf16.nxv8bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfne_vv_bf16m2_b8(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16m2_b8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv8bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfne_vf_bf16m2_b8(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16m2_b8(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv16bf16.nxv16bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfne_vv_bf16m4_b4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16m4_b4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv16bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbool4_t test_vmfne_vf_bf16m4_b4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16m4_b4(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv32bf16.nxv32bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfne_vv_bf16m8_b2(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16m8_b2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv32bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfne_vf_bf16m8_b2(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16m8_b2(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfne_vv_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16mf4_b64_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfne_vf_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16mf4_b64_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfne_vv_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16mf2_b32_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfne_vf_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16mf2_b32_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vmfne.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfne_vv_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16m1_b16_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfne_vf_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16m1_b16_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfne_vv_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16m2_b8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfne_vf_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16m2_b8_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfne_vv_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16m4_b4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfne_vf_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16m4_b4_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfne_vv_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return 
__riscv_vmfne_vv_bf16m8_b2_m(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfne_vf_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16m8_b2_m(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfadd.c new file mode 100644 index 0000000000000..9d6b071c2768c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfadd.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfadd.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t 
test_vfadd_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfadd(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfadd(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfadd(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfadd(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfadd(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfadd(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfclass.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfclass.c new file mode 100644 index 0000000000000..2760f85a45d3c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfclass.c @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf4_u16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv1bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf4_t test_vfclass_v_bf16mf4_u16mf4(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfclass(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf2_u16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv2bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf2_t test_vfclass_v_bf16mf2_u16mf2(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfclass(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m1_u16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv4bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m1_t test_vfclass_v_bf16m1_u16m1(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfclass(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m2_u16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv8bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m2_t test_vfclass_v_bf16m2_u16m2(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfclass(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m4_u16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv16bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m4_t test_vfclass_v_bf16m4_u16m4(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfclass(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m8_u16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv32bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m8_t test_vfclass_v_bf16m8_u16m8(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfclass(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf4_u16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf4_t test_vfclass_v_bf16mf4_u16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfclass(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf2_u16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf2_t test_vfclass_v_bf16mf2_u16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfclass(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m1_u16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m1_t test_vfclass_v_bf16m1_u16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfclass(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m2_u16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m2_t test_vfclass_v_bf16m2_u16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfclass(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m4_u16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m4_t test_vfclass_v_bf16m4_u16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfclass(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m8_u16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m8_t test_vfclass_v_bf16m8_u16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfclass(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmacc.c new file mode 100644 index 0000000000000..ae3f1f24eb762 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmacc.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, 
size_t vl) { + return __riscv_vfmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], 
[[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfmacc_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m4_m( +// CHECK-RV64-SAME: 
[[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmadd.c new file mode 100644 index 0000000000000..db2184c6e1dba --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmadd.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd(vd, vs1, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] 
+// +vbfloat16m2_t test_vfmadd_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], 
[[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmax.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmax.c new file mode 100644 index 0000000000000..66497bfb15934 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmax.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmax(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmax(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmax(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmax(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16m4_t test_vfmax_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmax(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmax(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmax(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmax(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmax(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmax(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmax(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfmax.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfmax_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+  return __riscv_vfmax(mask, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfmax_vf_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfmax.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfmax_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfmax(mask, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmerge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmerge.c
new file mode 100644
index 0000000000000..1dc290bf1a222
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmerge.c
@@ -0,0 +1,69 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmerge_vfm_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmerge.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 1 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfmerge_vfm_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, vbool64_t mask, size_t vl) {
+  return __riscv_vfmerge(op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfmerge_vfm_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfmerge.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 2 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfmerge_vfm_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, vbool32_t mask, size_t vl) {
+  return __riscv_vfmerge(op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfmerge_vfm_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfmerge.nxv4bf16.bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 4 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfmerge_vfm_bf16m1(vbfloat16m1_t op1, __bf16 op2, vbool16_t mask, size_t vl) {
+  return __riscv_vfmerge(op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfmerge_vfm_bf16m2(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfmerge.nxv8bf16.bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+vbfloat16m2_t test_vfmerge_vfm_bf16m2(vbfloat16m2_t op1, __bf16 op2, vbool8_t mask, size_t vl) {
+  return __riscv_vfmerge(op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfmerge_vfm_bf16m4(
+// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfmerge.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfmerge_vfm_bf16m4(vbfloat16m4_t op1, __bf16 op2, vbool4_t mask, size_t vl) {
+  return __riscv_vfmerge(op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfmerge_vfm_bf16m8(
+// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfmerge.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfmerge_vfm_bf16m8(vbfloat16m8_t op1, __bf16 op2, vbool2_t mask, size_t vl) {
+  return __riscv_vfmerge(op1, op2, mask, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmin.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmin.c
new file mode 100644
index 0000000000000..1564d1195ecef
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmin.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmin_vv_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmin.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfmin_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vfmin(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmin_vf_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmin.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfmin_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfmin(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfmin_vv_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfmin.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfmin_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+  return
__riscv_vfmin(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmin(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmin(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmin(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmin(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmin(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmin(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmin(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m1_m( +// CHECK-RV64-SAME: 
[[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmin(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmin(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmin(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t 
test_vfmin_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfmin(mask, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmsac.c
new file mode 100644
index 0000000000000..0384e7da29e08
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmsac.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmsac_vv_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmsac.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfmsac_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfmsac(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmsac_vf_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmsac.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfmsac_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfmsac(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfmsac_vv_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfmsac.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfmsac_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfmsac(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfmsac_vf_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfmsac.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfmsac_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfmsac(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfmsac_vv_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfmsac.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS1]], <vscale x 4 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfmsac_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) {
+  return __riscv_vfmsac(vd, vs1, vs2, vl);
+}
+
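// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: how the overloaded bf16
// __riscv_vfmsac exercised by the checks above might appear in user code.
// Assumes a toolchain that accepts the experimental zvfbfa extension (the
// RUN lines use -target-feature +experimental-zvfbfa); the helper name
// fmsac_bf16_inplace is hypothetical.
// ---------------------------------------------------------------------------
#include <riscv_vector.h>
#include <stddef.h>

// d[i] = a[i] * b[i] - d[i], elementwise in bfloat16, strip-mined by vsetvl.
static void fmsac_bf16_inplace(__bf16 *d, const __bf16 *a, const __bf16 *b,
                               size_t n) {
  for (size_t i = 0; i < n;) {
    size_t vl = __riscv_vsetvl_e16m1(n - i);   // elements handled this pass
    vbfloat16m1_t vd = __riscv_vle16_v_bf16m1(d + i, vl);
    vbfloat16m1_t va = __riscv_vle16_v_bf16m1(a + i, vl);
    vbfloat16m1_t vb = __riscv_vle16_v_bf16m1(b + i, vl);
    // Overloaded form under test: vfmsac computes vd = (va * vb) - vd.
    vd = __riscv_vfmsac(vd, va, vb, vl);
    __riscv_vse16_v_bf16m1(d + i, vd, vl);
    i += vl;
  }
}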
+// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16m8_t test_vfmsac_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vf_bf16m1_m(vbool16_t mask, 
vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + 
return __riscv_vfmsac(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmsub.c new file mode 100644 index 0000000000000..306f189b54c89 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmsub.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m1( +// 
CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, 
vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return 
__riscv_vfmsub(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub(mask, vd, rs1, vs2, vl); +} + diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmul.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmul.c new file mode 100644 index 0000000000000..fffd83a12d36c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmul.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16m1_t test_vfmul_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmul(mask, op1, op2, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmul(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmul(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmul(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmul.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmul(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmul(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmv.c new file mode 100644 index 0000000000000..f85378ff0f37a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfmv.c @@ -0,0 +1,69 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local bfloat @test_vfmv_f_s_bf16mf4_bf16( +// CHECK-RV64-SAME: [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call bfloat @llvm.riscv.vfmv.f.s.nxv1bf16( [[SRC]]) +// CHECK-RV64-NEXT: ret bfloat [[TMP0]] +// +__bf16 
test_vfmv_f_s_bf16mf4_bf16(vbfloat16mf4_t src) { + return __riscv_vfmv_f(src); +} + +// CHECK-RV64-LABEL: define dso_local bfloat @test_vfmv_f_s_bf16mf2_bf16( +// CHECK-RV64-SAME: [[SRC:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call bfloat @llvm.riscv.vfmv.f.s.nxv2bf16( [[SRC]]) +// CHECK-RV64-NEXT: ret bfloat [[TMP0]] +// +__bf16 test_vfmv_f_s_bf16mf2_bf16(vbfloat16mf2_t src) { + return __riscv_vfmv_f(src); +} + +// CHECK-RV64-LABEL: define dso_local bfloat @test_vfmv_f_s_bf16m1_bf16( +// CHECK-RV64-SAME: [[SRC:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call bfloat @llvm.riscv.vfmv.f.s.nxv4bf16( [[SRC]]) +// CHECK-RV64-NEXT: ret bfloat [[TMP0]] +// +__bf16 test_vfmv_f_s_bf16m1_bf16(vbfloat16m1_t src) { + return __riscv_vfmv_f(src); +} + +// CHECK-RV64-LABEL: define dso_local bfloat @test_vfmv_f_s_bf16m2_bf16( +// CHECK-RV64-SAME: [[SRC:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call bfloat @llvm.riscv.vfmv.f.s.nxv8bf16( [[SRC]]) +// CHECK-RV64-NEXT: ret bfloat [[TMP0]] +// +__bf16 test_vfmv_f_s_bf16m2_bf16(vbfloat16m2_t src) { + return __riscv_vfmv_f(src); +} + +// CHECK-RV64-LABEL: define dso_local bfloat @test_vfmv_f_s_bf16m4_bf16( +// CHECK-RV64-SAME: [[SRC:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call bfloat @llvm.riscv.vfmv.f.s.nxv16bf16( [[SRC]]) +// CHECK-RV64-NEXT: ret bfloat [[TMP0]] +// +__bf16 test_vfmv_f_s_bf16m4_bf16(vbfloat16m4_t src) { + return __riscv_vfmv_f(src); +} + +// CHECK-RV64-LABEL: define dso_local bfloat @test_vfmv_f_s_bf16m8_bf16( +// CHECK-RV64-SAME: [[SRC:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call bfloat @llvm.riscv.vfmv.f.s.nxv32bf16( [[SRC]]) +// CHECK-RV64-NEXT: ret bfloat [[TMP0]] +// +__bf16 test_vfmv_f_s_bf16m8_bf16(vbfloat16m8_t src) { + return __riscv_vfmv_f(src); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfncvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfncvt.c new file mode 100644 index 0000000000000..fb635d61670c2 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfncvt.c @@ -0,0 +1,724 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv1i8.nxv1bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_x(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv2i8.nxv2bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] 
+// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_x(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv4i8.nxv4bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv8i8.nxv8bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv16i8.nxv16bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv32i8.nxv32bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv1i8.nxv1bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv2i8.nxv2bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv4i8.nxv4bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfncvt.xu.f.w.nxv8i8.nxv8bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv16i8.nxv16bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv32i8.nxv32bf16.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv1bf16.nxv1f32.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4(vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv2bf16.nxv2f32.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2(vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv4bf16.nxv4f32.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1(vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv8bf16.nxv8f32.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2(vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv16bf16.nxv16f32.i64( poison, [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4(vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_x(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_x(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_x(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_x(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_x(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_x(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_m( +// CHECK-RV64-SAME: 
[[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_xu(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_xu(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfncvt_f_f_w_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_m(vbool8_t vm, vfloat32m4_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( poison, [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_m(vbool4_t vm, vfloat32m8_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv1i8.nxv1bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_rm(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_x(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv2i8.nxv2bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_rm(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_x(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv4i8.nxv4bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_rm(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv8i8.nxv8bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_rm(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv16i8.nxv16bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_rm(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv32i8.nxv32bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_rm(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv1i8.nxv1bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv2i8.nxv2bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv4i8.nxv4bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_rm(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv8i8.nxv8bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_rm(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv16i8.nxv16bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_rm(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv32i8.nxv32bf16.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_rm(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv1bf16.nxv1f32.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_rm(vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv2bf16.nxv2f32.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_rm(vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv4bf16.nxv4f32.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_rm(vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv8bf16.nxv8f32.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_rm(vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv16bf16.nxv16f32.i64( poison, [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_rm(vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16(vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_x(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_m(vbool32_t vm, + vbfloat16mf2_t vs2, size_t vl) { + return 
__riscv_vfncvt_x(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_x(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_x(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_x(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_rm_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_x(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_m(vbool32_t vm, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_m(vbool16_t vm, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_rm_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_xu(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_rm_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_rm_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_rm_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfncvt_f_f_w_bf16m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_rm_m(vbool8_t vm, vfloat32m4_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16(vm, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( poison, [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_rm_m(vbool4_t vm, vfloat32m8_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16(vm, vs2, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfncvt_rod.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfncvt_rod.c new file mode 100644 index 0000000000000..1ad856df48c4b --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfncvt_rod.c @@ -0,0 +1,113 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv1bf16.nxv1f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_rod_f_f_w_bf16mf4(vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv2bf16.nxv2f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_rod_f_f_w_bf16mf2(vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv4bf16.nxv4f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_rod_f_f_w_bf16m1(vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv8bf16.nxv8f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t
test_vfncvt_rod_f_f_w_bf16m2(vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv16bf16.nxv16f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_rod_f_f_w_bf16m4(vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv1bf16.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_rod_f_f_w_bf16mf4_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rod_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv2bf16.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_rod_f_f_w_bf16mf2_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rod_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv4bf16.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_rod_f_f_w_bf16m1_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_vfncvt_rod_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv8bf16.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_rod_f_f_w_bf16m2_m(vbool8_t vm, vfloat32m4_t vs2, + size_t vl) { + return __riscv_vfncvt_rod_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv16bf16.nxv16f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_rod_f_f_w_bf16m4_m(vbool4_t vm, vfloat32m8_t vs2, + size_t vl) { + return __riscv_vfncvt_rod_f_bf16(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfncvt_rtz.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfncvt_rtz.c new file mode 100644 index 0000000000000..12d08934bd49d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfncvt_rtz.c @@ -0,0 +1,267 @@ 
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv1i8.nxv1bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv2i8.nxv2bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m1_i8mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv4i8.nxv4bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_rtz_x_f_w_bf16m1_i8mf2(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m2_i8m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv8i8.nxv8bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_rtz_x_f_w_bf16m2_i8m1(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m4_i8m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv16i8.nxv16bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_rtz_x_f_w_bf16m4_i8m2(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m8_i8m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv32i8.nxv32bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_rtz_x_f_w_bf16m8_i8m4(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv1i8.nxv1bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t
test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8(vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv2i8.nxv2bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4(vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv4i8.nxv4bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m2_u8m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv8i8.nxv8bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_rtz_xu_f_w_bf16m2_u8m1(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m4_u8m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv16i8.nxv16bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_rtz_xu_f_w_bf16m4_u8m2(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m8_u8m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv32i8.nxv32bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_rtz_xu_f_w_bf16m8_u8m4(vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv1i8.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_m(vbool64_t vm, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv2i8.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_m(vbool32_t vm, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv4i8.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m2_i8m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv8i8.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_rtz_x_f_w_bf16m2_i8m1_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m4_i8m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv16i8.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_rtz_x_f_w_bf16m4_i8m2_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m8_i8m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv32i8.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_rtz_x_f_w_bf16m8_i8m4_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv1i8.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_m(vbool64_t vm, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv2i8.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_m(vbool32_t vm, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv4i8.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_m(vbool16_t vm, + vbfloat16m1_t vs2, 
+ size_t vl) { + return __riscv_vfncvt_rtz_xu(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv8i8.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv16i8.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv32i8.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfnmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfnmacc.c new file mode 100644 index 0000000000000..6f7928b763230 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfnmacc.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf2( +//
CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vv_bf16m4(vbfloat16m4_t vd, 
vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf2_m( +// 
CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfnmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfnmadd.c new file mode 100644 index 0000000000000..97d207040e5b2 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfnmadd.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local
@test_vfnmadd_vv_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t 
test_vfnmadd_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfnmadd_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m4_m( +// 
CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfnmadd.mask.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfnmadd_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) {
+ return __riscv_vfnmadd(mask, vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmadd_vv_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmadd.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmadd_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) {
+ return __riscv_vfnmadd(mask, vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmadd_vf_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmadd.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmadd_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) {
+ return __riscv_vfnmadd(mask, vd, rs1, vs2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfnmsac.c
new file mode 100644
index 0000000000000..404b4f83c4f1f
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfnmsac.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsac_vv_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsac_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+ return __riscv_vfnmsac(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsac_vf_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsac_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) {
+ return __riscv_vfnmsac(vd, rs1, vs2, vl);
+}
+
+//
CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac(mask, vd, vs1, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
<vscale x 16 x bfloat> @test_vfnmsac_vf_bf16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfnmsac.mask.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfnmsac_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) {
+ return __riscv_vfnmsac(mask, vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmsac_vv_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmsac.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmsac_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) {
+ return __riscv_vfnmsac(mask, vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmsac_vf_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmsac.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmsac_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) {
+ return __riscv_vfnmsac(mask, vd, rs1, vs2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfnmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfnmsub.c
new file mode 100644
index 0000000000000..3a520dd9af9bf
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfnmsub.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsub_vv_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsub_vv_bf16mf4(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+ return __riscv_vfnmsub(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsub_vf_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsub_vf_bf16mf4(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) {
+ return __riscv_vfnmsub(vd,
rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vv_bf16mf2(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vf_bf16mf2(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vv_bf16m1(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vf_bf16m1(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vv_bf16m2(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vf_bf16m2(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vv_bf16m4(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vf_bf16m4(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vv_bf16m8(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vf_bf16m8(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub(mask, vd, vs1, vs2, vl); 
+} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local <vscale x 16 x bfloat> @test_vfnmsub_vf_bf16m4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfnmsub.mask.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfnmsub_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) {
+ return __riscv_vfnmsub(mask, vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmsub_vv_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmsub.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmsub_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) {
+ return __riscv_vfnmsub(mask, vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmsub_vf_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmsub.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmsub_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) {
+ return __riscv_vfnmsub(mask, vd, rs1, vs2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfrec7.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfrec7.c
new file mode 100644
index 0000000000000..462b6acf8eee6
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfrec7.c
@@ -0,0 +1,129 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfrec7_v_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfrec7.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfrec7_v_bf16mf4(vbfloat16mf4_t op1, size_t vl) {
+ return __riscv_vfrec7(op1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfrec7_v_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfrec7.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[OP1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfrec7_v_bf16mf2(vbfloat16mf2_t op1, size_t vl) {
+ return __riscv_vfrec7(op1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfrec7_v_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:
[[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv4bf16.i64( poison, [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrec7_v_bf16m1(vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrec7(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv8bf16.i64( poison, [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrec7_v_bf16m2(vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrec7(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv16bf16.i64( poison, [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrec7_v_bf16m4(vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrec7(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv32bf16.i64( poison, [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrec7_v_bf16m8(vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrec7(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv1bf16.i64( poison, [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrec7_v_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrec7(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv2bf16.i64( poison, [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrec7_v_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrec7(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv4bf16.i64( poison, [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrec7_v_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrec7(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv8bf16.i64( poison, [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrec7_v_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrec7(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], 
<vscale x 16 x bfloat> [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfrec7.mask.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[OP1]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfrec7_v_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, size_t vl) {
+ return __riscv_vfrec7(mask, op1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfrec7_v_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfrec7.mask.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfrec7_v_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, size_t vl) {
+ return __riscv_vfrec7(mask, op1, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfrsqrt7.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfrsqrt7.c
new file mode 100644
index 0000000000000..051fde7ef5472
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfrsqrt7.c
@@ -0,0 +1,129 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfrsqrt7_v_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfrsqrt7.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfrsqrt7_v_bf16mf4(vbfloat16mf4_t op1, size_t vl) {
+ return __riscv_vfrsqrt7(op1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfrsqrt7_v_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfrsqrt7.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[OP1]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfrsqrt7_v_bf16mf2(vbfloat16mf2_t op1, size_t vl) {
+ return __riscv_vfrsqrt7(op1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfrsqrt7_v_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfrsqrt7.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[OP1]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfrsqrt7_v_bf16m1(vbfloat16m1_t op1, size_t vl) {
+ return __riscv_vfrsqrt7(op1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfrsqrt7_v_bf16m2(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfrsqrt7.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[OP1]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+vbfloat16m2_t test_vfrsqrt7_v_bf16m2(vbfloat16m2_t op1, size_t vl) {
+ return __riscv_vfrsqrt7(op1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfrsqrt7_v_bf16m4(
+// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:
[[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv16bf16.i64( poison, [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsqrt7_v_bf16m4(vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrsqrt7(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv32bf16.i64( poison, [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsqrt7_v_bf16m8(vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrsqrt7(op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv1bf16.i64( poison, [[OP1]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsqrt7_v_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrsqrt7(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv2bf16.i64( poison, [[OP1]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsqrt7_v_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrsqrt7(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv4bf16.i64( poison, [[OP1]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsqrt7_v_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrsqrt7(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv8bf16.i64( poison, [[OP1]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsqrt7_v_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrsqrt7(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv16bf16.i64( poison, [[OP1]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsqrt7_v_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrsqrt7(mask, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv32bf16.i64( poison, [[OP1]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsqrt7_v_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, size_t vl) { + return 
__riscv_vfrsqrt7(mask, op1, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfrsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfrsub.c
new file mode 100644
index 0000000000000..04941823d1917
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfrsub.c
@@ -0,0 +1,129 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfrsub_vf_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfrsub.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfrsub_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfrsub(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfrsub_vf_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfrsub.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfrsub_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfrsub(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfrsub_vf_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfrsub.nxv4bf16.bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfrsub_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfrsub(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfrsub_vf_bf16m2(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfrsub.nxv8bf16.bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+vbfloat16m2_t test_vfrsub_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfrsub(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfrsub_vf_bf16m4(
+// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfrsub.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfrsub_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfrsub(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfrsub_vf_bf16m8(
+// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfrsub.nxv32bf16.bf16.i64(<vscale x 32 x bfloat>
poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsub_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsub_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsub_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsub_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsub_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsub_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsub_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub(mask, op1, op2, vl); +} + diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfsgnj.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfsgnj.c
new file mode 100644
index 0000000000000..615deddf4254f
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfsgnj.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfsgnj_vv_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnj.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfsgnj_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+ return __riscv_vfsgnj(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfsgnj_vf_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnj.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfsgnj_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfsgnj(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfsgnj_vv_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnj.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfsgnj_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+ return __riscv_vfsgnj(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfsgnj_vf_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfsgnj.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfsgnj_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfsgnj(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfsgnj_vv_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnj.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfsgnj_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) {
+ return __riscv_vfsgnj(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfsgnj_vf_bf16m1(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfsgnj.nxv4bf16.bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[OP1]], bfloat
test_vfsgnj_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnj(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnj(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnj(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnj(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local @test_vfsgnj_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnj(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnj(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnj(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat 
[[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnj(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnj(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfsgnjn.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfsgnjn.c new file mode 100644 index 0000000000000..a895e5f6ad6bd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfsgnjn.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t 
test_vfsgnjn_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjn(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjn(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjn(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjn(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vfsgnjn_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjn(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjn(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjn(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjn(mask, op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjn(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjn(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjn(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfsgnjn.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjn(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfsgnjx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfsgnjx.c new file mode 100644 index 0000000000000..0187516d00dc1 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfsgnjx.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjx(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vv_bf16mf2(vbfloat16mf2_t op1, 
vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjx(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjx(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjx(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjx(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m8( +// CHECK-RV64-SAME: 
[[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjx(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjx(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjx(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, 
size_t vl) { + return __riscv_vfsgnjx(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjx(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjx(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjx(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfslide1down.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfslide1down.c new file mode 100644 index 0000000000000..4a7689492de92 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfslide1down.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf4( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv1bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1down_vf_bf16mf4(vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf2( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv2bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1down_vf_bf16mf2(vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m1( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv4bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1down_vf_bf16m1(vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m2( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv8bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1down_vf_bf16m2(vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m4( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv16bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1down_vf_bf16m4(vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m8( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv32bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1down_vf_bf16m8(vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv1bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1down_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv2bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1down_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv4bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1down_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv8bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1down_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv16bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1down_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t src, __bf16 value, size_t vl) { + return 
__riscv_vfslide1down(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv32bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1down_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down(mask, src, value, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfslide1up.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfslide1up.c new file mode 100644 index 0000000000000..f9f2dc0cf26b8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfslide1up.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf4( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv1bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1up_vf_bf16mf4(vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf2( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv2bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1up_vf_bf16mf2(vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m1( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv4bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1up_vf_bf16m1(vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m2( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv8bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1up_vf_bf16m2(vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m4( +// CHECK-RV64-SAME: [[SRC:%.*]], 
bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv16bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1up_vf_bf16m4(vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m8( +// CHECK-RV64-SAME: [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv32bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1up_vf_bf16m8(vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up(src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv1bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1up_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv2bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1up_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv4bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1up_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv8bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1up_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up(mask, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv16bf16.bf16.i64( poison, [[SRC]], bfloat [[VALUE]], 
<vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfslide1up_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t src, __bf16 value, size_t vl) {
+  return __riscv_vfslide1up(mask, src, value, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfslide1up_vf_bf16m8_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfslide1up.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[SRC]], bfloat [[VALUE]], <vscale x 32 x i1> [[MASK]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfslide1up_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t src, __bf16 value, size_t vl) {
+  return __riscv_vfslide1up(mask, src, value, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfsub.c
new file mode 100644
index 0000000000000..ebcf6fa4669a3
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfsub.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfsub_vv_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfsub.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfsub_vv_bf16mf4(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vfsub(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfsub_vf_bf16mf4(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfsub.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfsub_vf_bf16mf4(vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfsub(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfsub_vv_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfsub.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfsub_vv_bf16mf2(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+  return __riscv_vfsub(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfsub_vf_bf16mf2(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfsub.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfsub_vf_bf16mf2(vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfsub(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat>
@test_vfsub_vv_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vv_bf16m1(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsub(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m1( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vf_bf16m1(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vv_bf16m2(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsub(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vf_bf16m2(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vv_bf16m4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsub(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vf_bf16m4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vv_bf16m8(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsub(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vf_bf16m8(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vv_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vf_bf16mf4_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vv_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vf_bf16mf2_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vv_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m1_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vf_bf16m1_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub(mask, op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vv_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vf_bf16m2_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vv_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vf_bf16m4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vv_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsub(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vf_bf16m8_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwadd.c new file mode 100644 index 0000000000000..124e7fb6de342 --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwadd.c @@ -0,0 +1,893 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2(vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2(vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2(vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_wv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2(vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1(vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1(vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf(vs2,
rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1(vfloat32m1_t vs2, vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwadd_wv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1(vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2(vbfloat16m1_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwadd_vv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2(vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2(vfloat32m2_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwadd_wv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2(vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4(vbfloat16m2_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwadd_vv(vs2, vs1, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4(vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4(vfloat32m4_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwadd_wv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4(vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8(vbfloat16m4_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8(vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8(vfloat32m8_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8(vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf(vs2, rs1, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_m(vbool64_t vm, vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_m(vbool64_t vm, vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_wv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] 
{ +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv(vm, vs2, vs1, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_rm(vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_rm(vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_rm(vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_wv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_rm(vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_rm(vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, 
i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_rm(vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_rm(vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_rm(vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_rm(vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_rm(vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_rm(vfloat32m2_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwadd_wv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_rm(vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_rm(vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_rm(vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_rm(vfloat32m4_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwadd_wv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_rm(vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_rm(vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_rm(vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_rm(vfloat32m8_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vfwadd_wf_bf16_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_rm(vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_rm_m(vbool64_t vm, vfloat32mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], 
[[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwcvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwcvt.c new file mode 100644 index 0000000000000..0399a639898a0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwcvt.c @@ -0,0 +1,366 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv1bf16.nxv1i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_x_v_bf16mf4(vint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv2bf16.nxv2i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_x_v_bf16mf2(vint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv4bf16.nxv4i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t
test_vfwcvt_f_x_v_bf16m1(vint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv8bf16.nxv8i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_x_v_bf16m2(vint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv16bf16.nxv16i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_x_v_bf16m4(vint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv32bf16.nxv32i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_x_v_bf16m8(vint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv1bf16.nxv1i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_xu_v_bf16mf4(vuint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv2bf16.nxv2i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_xu_v_bf16mf2(vuint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv4bf16.nxv4i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_xu_v_bf16m1(vuint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv8bf16.nxv8i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_xu_v_bf16m2(vuint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv16bf16.nxv16i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t 
test_vfwcvt_f_xu_v_bf16m4(vuint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv32bf16.nxv32i8.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_xu_v_bf16m8(vuint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwcvt_f_f_v_bf16mf4_f32mf2(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv2f32.nxv2bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwcvt_f_f_v_bf16mf2_f32m1(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_bf16m1_f32m2(vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwcvt_f_f_v_bf16m2_f32m4(vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwcvt_f_f_v_bf16m4_f32m8(vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv1bf16.nxv1i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_x_v_bf16mf4_m(vbool64_t vm, vint8mf8_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwcvt.f.x.v.mask.nxv2bf16.nxv2i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_x_v_bf16mf2_m(vbool32_t vm, vint8mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv4bf16.nxv4i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_x_v_bf16m1_m(vbool16_t vm, vint8mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv8bf16.nxv8i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_x_v_bf16m2_m(vbool8_t vm, vint8m1_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv16bf16.nxv16i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_x_v_bf16m4_m(vbool4_t vm, vint8m2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv32bf16.nxv32i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_x_v_bf16m8_m(vbool2_t vm, vint8m4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv1bf16.nxv1i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_xu_v_bf16mf4_m(vbool64_t vm, vuint8mf8_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv2bf16.nxv2i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_xu_v_bf16mf2_m(vbool32_t vm, vuint8mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv4bf16.nxv4i8.i64( poison, 
[[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_xu_v_bf16m1_m(vbool16_t vm, vuint8mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv8bf16.nxv8i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_xu_v_bf16m2_m(vbool8_t vm, vuint8m1_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv16bf16.nxv16i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_xu_v_bf16m4_m(vbool4_t vm, vuint8m2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv32bf16.nxv32i8.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_xu_v_bf16m8_m(vbool2_t vm, vuint8m4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwcvt_f_f_v_bf16mf4_f32mf2_m(vbool64_t vm, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwcvt_f_f_v_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwcvt_f(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8bf16.i64( poison, [[VS2]], 
<vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwcvt_f_f_v_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2,
+                                              size_t vl) {
+  return __riscv_vfwcvt_f(vm, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwcvt_f_f_v_bf16m4_f32m8_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16bf16.i64(<vscale x 16 x float> poison, <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwcvt_f_f_v_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2,
+                                              size_t vl) {
+  return __riscv_vfwcvt_f(vm, vs2, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwmacc.c
new file mode 100644
index 0000000000000..2eb7fc8cdeea8
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwmacc.c
@@ -0,0 +1,474 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwmacc_vv_bf16mf4_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwmacc.nxv1f32.nxv1bf16.nxv1bf16.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2(vfloat32mf2_t vd,
+                                             vbfloat16mf4_t vs1,
+                                             vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfwmacc(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwmacc_vf_bf16mf4_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], bfloat noundef [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwmacc.nxv1f32.bf16.nxv1bf16.i64(<vscale x 1 x float> [[VD]], bfloat [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2(vfloat32mf2_t vd, __bf16 vs1,
+                                             vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfwmacc(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwmacc_vv_bf16mf2_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwmacc.nxv2f32.nxv2bf16.nxv2bf16.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x bfloat> [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1(vfloat32m1_t vd, vbfloat16mf2_t vs1,
+                                           vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfwmacc(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwmacc_vf_bf16mf2_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], bfloat noundef [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwmacc.nxv2f32.bf16.nxv2bf16.i64(<vscale x 2 x float> [[VD]], bfloat [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_m( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); 
+} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_rm(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_rm(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_rm(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_rm(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_rm(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_rm(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: 
[[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_rm(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_rm(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, 
__RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret 
<vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd,
+                                               __bf16 vs1, vbfloat16m4_t vs2,
+                                               size_t vl) {
+  return __riscv_vfwmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwmsac.c
new file mode 100644
index 0000000000000..28f507619b428
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwmsac.c
@@ -0,0 +1,474 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwmsac_vv_bf16mf4_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwmsac.nxv1f32.nxv1bf16.nxv1bf16.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2(vfloat32mf2_t vd,
+                                             vbfloat16mf4_t vs1,
+                                             vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfwmsac(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwmsac_vf_bf16mf4_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], bfloat noundef [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwmsac.nxv1f32.bf16.nxv1bf16.i64(<vscale x 1 x float> [[VD]], bfloat [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2(vfloat32mf2_t vd, __bf16 vs1,
+                                             vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfwmsac(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwmsac_vv_bf16mf2_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwmsac.nxv2f32.nxv2bf16.nxv2bf16.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x bfloat> [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1(vfloat32m1_t vd, vbfloat16mf2_t vs1,
+                                           vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfwmsac(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwmsac_vf_bf16mf2_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], bfloat noundef [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwmsac.nxv2f32.bf16.nxv2bf16.i64(<vscale x 2 x float> [[VD]], bfloat [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1(vfloat32m1_t vd, __bf16 vs1,
+                                           vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfwmsac(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwmsac_vv_bf16m1_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x bfloat> [[VS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwmsac.nxv4f32.nxv4bf16.nxv4bf16.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x bfloat> [[VS1]], <vscale x 4 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 
3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, 
vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t 
vs2, size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_rm(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + 
return __riscv_vfwmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_rm(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_rm(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_rm(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_rm(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_rm(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_rm(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwmsac_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_rm(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + 
vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwmul.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwmul.c new file mode 100644 index 0000000000000..8de49fa586dcf --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwmul.c
@@ -0,0 +1,451 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwmul_vv_bf16mf4_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VS2:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.nxv1bf16.i64(<vscale x 1 x float> poison, <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x bfloat> [[VS1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2(vbfloat16mf4_t vs2,
+                                            vbfloat16mf4_t vs1, size_t vl) {
+  return __riscv_vfwmul(vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwmul_vf_bf16mf4_f32mf2(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.bf16.i64(<vscale x 1 x float> poison, <vscale x 1 x bfloat> [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2(vbfloat16mf4_t vs2, __bf16 rs1,
+                                            size_t vl) {
+  return __riscv_vfwmul(vs2, rs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwmul_vv_bf16mf2_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VS2:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.nxv2bf16.i64(<vscale x 2 x float> poison, <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x bfloat> [[VS1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1(vbfloat16mf2_t vs2,
+                                          vbfloat16mf2_t vs1, size_t vl) {
+  return __riscv_vfwmul(vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwmul_vf_bf16mf2_f32m1(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.bf16.i64(<vscale x 2 x float> poison, <vscale x 2 x bfloat> [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1(vbfloat16mf2_t vs2, __bf16 rs1,
+                                          size_t vl) {
+  return __riscv_vfwmul(vs2, rs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwmul_vv_bf16m1_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VS2:%.*]], <vscale x 4 x bfloat> [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.nxv4bf16.i64(<vscale x 4 x float> poison, <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x bfloat> [[VS1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2(vbfloat16m1_t vs2, vbfloat16m1_t vs1,
+                                         size_t vl) {
+  return __riscv_vfwmul(vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwmul_vf_bf16m1_f32m2(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.bf16.i64(<vscale x 4 x float> poison, <vscale x 4 x bfloat> [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2(vbfloat16m1_t vs2, __bf16 rs1,
+                                         size_t vl) {
+  return __riscv_vfwmul(vs2, rs1, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4(vbfloat16m2_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwmul(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4(vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8(vbfloat16m4_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwmul(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8(vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_m(vbool64_t vm, vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_m(vbool64_t vm, vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: 
ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, vs1, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_rm(vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwmul(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_rm(vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_rm(vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_rm(vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_rm(vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 
0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_rm(vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_rm(vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_rm(vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_rm(vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_rm(vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwmul(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwmul_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_rm_m( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwnmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwnmacc.c new file mode 100644 index 0000000000000..783693106a217 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwnmacc.c @@ -0,0 +1,480 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1(vfloat32m1_t vd, vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, vl); +} + +// 
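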
CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat 
noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_rm(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_rm(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_rm(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_rm(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_rm(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] 
{ +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_rm(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_rm(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_rm(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_rm_m( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, 
+ vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwnmsac.c new file mode 100644 index 0000000000000..ca936af7140e5 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwnmsac.c @@ -0,0 +1,480 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1(vfloat32m1_t vd, vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], 
[[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vd, + 
__bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t 
test_vfwnmsac_vf_bf16mf4_f32mf2_rm(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_rm(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_rm(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_rm(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_rm(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_rm(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_rm(vfloat32m4_t vd, __bf16 vs1, + 
vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_rm(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_rm(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 
3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwsub.c new file mode 100644 index 0000000000000..2e22e22e4b6fc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vfwsub.c @@ -0,0 +1,893 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2(vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2(vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2(vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_wv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2(vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.nxv2bf16.i64( poison,
[[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1(vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1(vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1(vfloat32m1_t vs2, vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwsub_wv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1(vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2(vbfloat16m1_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwsub_vv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2(vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2(vfloat32m2_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwsub_wv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.bf16.i64( poison, [[VS2]], 
bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2(vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4(vbfloat16m2_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwsub_vv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4(vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4(vfloat32m4_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwsub_wv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4(vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8(vbfloat16m4_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8(vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 7, 
i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8(vfloat32m8_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv(vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8(vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf(vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_m(vbool64_t vm, vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_m(vbool64_t vm, vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_wv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwsub_vf_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_m(vbool32_t vm, vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_m(vbool16_t vm, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_m(vbool8_t vm, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
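+// An aside on the pattern above, offered as an editorial sketch rather
+// than part of the autogenerated checks: the masked overloads lower to
+// the `.mask` flavor of the intrinsic, which adds the mask operand,
+// keeps `i64 7` as the frm operand (__RISCV_FRM_DYN, i.e. read the frm
+// CSR), and appends an `i64 3` policy operand (tail-agnostic |
+// mask-agnostic). A hypothetical caller, mirroring the signatures in
+// this file, might look like:
+//
+//   vfloat32m2_t widen_sub_masked(vbool16_t m, vfloat32m2_t acc,
+//                                 vbfloat16m1_t v, size_t vl) {
+//     // acc[i] - widen(v[i]) for active lanes; inactive lanes agnostic
+//     return __riscv_vfwsub_wv(m, acc, v, vl);
+//   }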
+vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_m(vbool4_t vm, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv(vm, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf(vm, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_rm(vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_rm(vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_rm(vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_wv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_rm(vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_rm(vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_rm(vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_rm(vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_rm(vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_rm(vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_rm(vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_rm(vfloat32m2_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwsub_wv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_rm(vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_rm(vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_rm(vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_rm(vfloat32m4_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwsub_wv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_rm(vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_rm(vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
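+// The `_rm` overloads take an explicit rounding-mode argument:
+// `__RISCV_FRM_RNE` expands to 0, which is why the checks in this block
+// carry `i64 0` as the frm operand where the non-`_rm` forms above
+// carry `i64 7` (__RISCV_FRM_DYN). A minimal hypothetical use, sketched
+// here rather than taken from the generated file:
+//
+//   vfloat32m8_t widen_sub_rne(vbfloat16m4_t a, vbfloat16m4_t b,
+//                              size_t vl) {
+//     // rounding mode fixed to round-to-nearest-even at the call site
+//     return __riscv_vfwsub_vv(a, b, __RISCV_FRM_RNE, vl);
+//   }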
+vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_rm(vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_rm(vfloat32m8_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv(vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_rm( +// CHECK-RV64-SAME: [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_rm(vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf(vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_rm_m(vbool64_t vm, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_rm_m(vbool64_t vm, vfloat32mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf(vm, vs2, rs1, __RISCV_FRM_RNE, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_rm_m(vbool32_t vm, vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_rm_m(vbool32_t vm, vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_rm_m(vbool32_t vm, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_rm_m(vbool16_t vm, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vfwsub_wv_bf16m1_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_rm_m(vbool16_t vm, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_rm_m(vbool8_t vm, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_rm_m(vbool8_t vm, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], 
[[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_rm_m(vbool4_t vm, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( poison, [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv(vm, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_rm_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( poison, [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_rm_m(vbool4_t vm, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf(vm, vs2, rs1, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfeq.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfeq.c new file mode 100644 index 0000000000000..29881c9ba5d5e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfeq.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16mf4_b64( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv1bf16.nxv1bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfeq_vv_bf16mf4_b64(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfeq(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16mf4_b64( +// CHECK-RV64-SAME:
[[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv1bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfeq_vf_bf16mf4_b64(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv2bf16.nxv2bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfeq_vv_bf16mf2_b32(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfeq(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv2bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfeq_vf_bf16mf2_b32(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv4bf16.nxv4bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfeq_vv_bf16m1_b16(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfeq(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv4bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfeq_vf_bf16m1_b16(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv8bf16.nxv8bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfeq_vv_bf16m2_b8(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfeq(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv8bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfeq_vf_bf16m2_b8(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv16bf16.nxv16bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
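+// Comparisons return mask types whose width follows SEW/LMUL: bf16
+// elements (SEW=16) at LMUL=4 give a ratio of 4, so vmfeq on
+// vbfloat16m4_t values produces vbool4_t, matching the `b4` suffix in
+// the next test. A hypothetical sketch of the same overload, not part
+// of the autogenerated file:
+//
+//   vbool4_t eq_mask(vbfloat16m4_t x, vbfloat16m4_t y, size_t vl) {
+//     // bit i of the result is set when x[i] == y[i]
+//     return __riscv_vmfeq(x, y, vl);
+//   }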
+vbool4_t test_vmfeq_vv_bf16m4_b4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfeq(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv16bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfeq_vf_bf16m4_b4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv32bf16.nxv32bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfeq_vv_bf16m8_b2(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfeq(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.nxv32bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfeq_vf_bf16m8_b2(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfeq_vv_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfeq(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfeq_vf_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfeq_vv_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfeq(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t 
test_vmfeq_vf_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfeq_vv_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfeq(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfeq_vf_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfeq_vv_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfeq(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfeq_vf_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfeq_vv_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfeq(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfeq_vf_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfeq_vv_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+  return __riscv_vmfeq(mask, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfeq_vf_bf16m8_b2_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfeq.mask.nxv32bf16.bf16.i64(<vscale x 32 x i1> poison, <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfeq_vf_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfeq(mask, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfge.c
new file mode 100644
index 0000000000000..b8083c5ebb8df
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfge.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfge_vv_bf16mf4_b64(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfge.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmfge_vv_bf16mf4_b64(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vmfge(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfge_vf_bf16mf4_b64(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfge.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmfge_vf_bf16mf4_b64(vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfge(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfge_vv_bf16mf2_b32(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfge.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmfge_vv_bf16mf2_b32(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+  return __riscv_vmfge(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfge_vf_bf16mf2_b32(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfge.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmfge_vf_bf16mf2_b32(vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfge(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local
@test_vmfge_vv_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv4bf16.nxv4bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfge_vv_bf16m1_b16(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfge(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv4bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfge_vf_bf16m1_b16(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv8bf16.nxv8bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfge_vv_bf16m2_b8(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfge(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv8bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfge_vf_bf16m2_b8(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv16bf16.nxv16bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfge_vv_bf16m4_b4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfge(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv16bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfge_vf_bf16m4_b4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv32bf16.nxv32bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfge_vv_bf16m8_b2(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfge(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.nxv32bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) 
+// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfge_vf_bf16m8_b2(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfge_vv_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfge(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfge_vf_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfge_vv_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfge(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfge_vf_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfge_vv_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfge(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfge_vf_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfge_vv_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfge(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfge_vf_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfge_vv_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfge(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfge_vf_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfge_vv_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfge(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfge_vf_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfgt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfgt.c new file mode 100644 index 0000000000000..b8749b3295a19 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfgt.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 
-triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfgt_vv_bf16mf4_b64(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfgt.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmfgt_vv_bf16mf4_b64(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vmfgt(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfgt_vf_bf16mf4_b64(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfgt.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmfgt_vf_bf16mf4_b64(vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfgt(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfgt_vv_bf16mf2_b32(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfgt.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmfgt_vv_bf16mf2_b32(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+  return __riscv_vmfgt(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfgt_vf_bf16mf2_b32(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfgt.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmfgt_vf_bf16mf2_b32(vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfgt(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmfgt_vv_bf16m1_b16(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x i1> [[TMP0]]
+//
+vbool16_t test_vmfgt_vv_bf16m1_b16(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) {
+  return __riscv_vmfgt(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmfgt_vf_bf16m1_b16(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmfgt.nxv4bf16.bf16.i64(<vscale x 4 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x i1> [[TMP0]]
+//
+vbool16_t test_vmfgt_vf_bf16m1_b16(vbfloat16m1_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfgt(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmfgt_vv_bf16m2_b8(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmfgt.nxv8bf16.nxv8bf16.i64(<vscale x 8 x bfloat> [[OP1]], <vscale x 8 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x i1> [[TMP0]]
+//
+vbool8_t test_vmfgt_vv_bf16m2_b8(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) {
+  return __riscv_vmfgt(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL:
define dso_local @test_vmfgt_vf_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv8bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfgt_vf_bf16m2_b8(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv16bf16.nxv16bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfgt_vv_bf16m4_b4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfgt(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv16bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfgt_vf_bf16m4_b4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv32bf16.nxv32bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfgt_vv_bf16m8_b2(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfgt(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.nxv32bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfgt_vf_bf16m8_b2(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfgt_vv_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfgt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfgt_vf_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfgt_vv_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfgt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfgt_vf_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfgt_vv_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfgt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfgt_vf_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfgt_vv_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfgt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfgt_vf_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfgt_vv_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfgt(mask, 
op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfgt_vf_bf16m4_b4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfgt.mask.nxv16bf16.bf16.i64(<vscale x 16 x i1> poison, <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmfgt_vf_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfgt(mask, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfgt_vv_bf16m8_b2_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], <vscale x 32 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfgt.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x i1> poison, <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfgt_vv_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+  return __riscv_vmfgt(mask, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfgt_vf_bf16m8_b2_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfgt.mask.nxv32bf16.bf16.i64(<vscale x 32 x i1> poison, <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfgt_vf_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfgt(mask, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfle.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfle.c
new file mode 100644
index 0000000000000..724608c89e8de
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfle.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfle_vv_bf16mf4_b64(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfle.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmfle_vv_bf16mf4_b64(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vmfle(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfle_vf_bf16mf4_b64(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfle.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmfle_vf_bf16mf4_b64(vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfle(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfle_vv_bf16mf2_b32(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:
[[TMP0:%.*]] = call @llvm.riscv.vmfle.nxv2bf16.nxv2bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfle_vv_bf16mf2_b32(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfle(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.nxv2bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfle_vf_bf16mf2_b32(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.nxv4bf16.nxv4bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfle_vv_bf16m1_b16(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfle(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.nxv4bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfle_vf_bf16m1_b16(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.nxv8bf16.nxv8bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfle_vv_bf16m2_b8(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfle(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.nxv8bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfle_vf_bf16m2_b8(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.nxv16bf16.nxv16bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfle_vv_bf16m4_b4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfle(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.nxv16bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfle_vf_bf16m4_b4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle(op1, op2, vl); +} + 
+// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.nxv32bf16.nxv32bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfle_vv_bf16m8_b2(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfle(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.nxv32bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfle_vf_bf16m8_b2(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfle_vv_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfle(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfle_vf_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfle_vv_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfle(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfle_vf_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfle_vv_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, 
size_t vl) { + return __riscv_vmfle(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfle_vf_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfle_vv_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfle(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfle_vf_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfle_vv_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfle(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfle_vf_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfle_vv_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfle(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv32bf16.bf16.i64( poison, 
[[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfle_vf_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfle(mask, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmflt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmflt.c
new file mode 100644
index 0000000000000..1b0b898d5053f
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmflt.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmflt_vv_bf16mf4_b64(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmflt.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmflt_vv_bf16mf4_b64(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vmflt(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmflt_vf_bf16mf4_b64(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmflt.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmflt_vf_bf16mf4_b64(vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmflt(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmflt_vv_bf16mf2_b32(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmflt.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmflt_vv_bf16mf2_b32(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+  return __riscv_vmflt(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmflt_vf_bf16mf2_b32(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmflt.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmflt_vf_bf16mf2_b32(vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmflt(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmflt_vv_bf16m1_b16(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmflt.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x i1> [[TMP0]]
+//
+vbool16_t test_vmflt_vv_bf16m1_b16(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) {
+  return __riscv_vmflt(op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmflt_vf_bf16m1_b16(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv4bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmflt_vf_bf16m1_b16(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv8bf16.nxv8bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmflt_vv_bf16m2_b8(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmflt(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv8bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmflt_vf_bf16m2_b8(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv16bf16.nxv16bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmflt_vv_bf16m4_b4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmflt(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv16bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmflt_vf_bf16m4_b4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv32bf16.nxv32bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmflt_vv_bf16m8_b2(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmflt(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.nxv32bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmflt_vf_bf16m8_b2(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmflt_vv_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, 
vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmflt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmflt_vf_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmflt_vv_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmflt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmflt_vf_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmflt_vv_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmflt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmflt_vf_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmflt_vv_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmflt(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vmflt.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x i1> [[TMP0]]
+//
+vbool8_t test_vmflt_vf_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmflt(mask, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmflt_vv_bf16m4_b4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], <vscale x 16 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmflt.mask.nxv16bf16.nxv16bf16.i64(<vscale x 16 x i1> poison, <vscale x 16 x bfloat> [[OP1]], <vscale x 16 x bfloat> [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmflt_vv_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) {
+  return __riscv_vmflt(mask, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmflt_vf_bf16m4_b4_m(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmflt.mask.nxv16bf16.bf16.i64(<vscale x 16 x i1> poison, <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmflt_vf_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmflt(mask, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmflt_vv_bf16m8_b2_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], <vscale x 32 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmflt.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x i1> poison, <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmflt_vv_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+  return __riscv_vmflt(mask, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmflt_vf_bf16m8_b2_m(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmflt.mask.nxv32bf16.bf16.i64(<vscale x 32 x i1> poison, <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmflt_vf_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmflt(mask, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfne.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfne.c
new file mode 100644
index 0000000000000..672c1504d73eb
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/non-policy/overloaded/vmfne.c
@@ -0,0 +1,249 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfne_vv_bf16mf4_b64(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfne.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t
test_vmfne_vv_bf16mf4_b64(vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfne(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16mf4_b64( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv1bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfne_vf_bf16mf4_b64(vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv2bf16.nxv2bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfne_vv_bf16mf2_b32(vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfne(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16mf2_b32( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv2bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfne_vf_bf16mf2_b32(vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv4bf16.nxv4bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfne_vv_bf16m1_b16(vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfne(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m1_b16( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv4bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfne_vf_bf16m1_b16(vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv8bf16.nxv8bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfne_vv_bf16m2_b8(vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfne(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m2_b8( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv8bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfne_vf_bf16m2_b8(vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv16bf16.nxv16bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfne_vv_bf16m4_b4(vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfne(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m4_b4( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv16bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfne_vf_bf16m4_b4(vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv32bf16.nxv32bf16.i64( [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfne_vv_bf16m8_b2(vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfne(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m8_b2( +// CHECK-RV64-SAME: [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.nxv32bf16.bf16.i64( [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfne_vf_bf16m8_b2(vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne(op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv1bf16.nxv1bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfne_vv_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfne(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16mf4_b64_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv1bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfne_vf_bf16mf4_b64_m(vbool64_t mask, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv2bf16.nxv2bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfne_vv_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfne(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16mf2_b32_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv2bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfne_vf_bf16mf2_b32_m(vbool32_t mask, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv4bf16.nxv4bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfne_vv_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfne(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m1_b16_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv4bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfne_vf_bf16m1_b16_m(vbool16_t mask, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv8bf16.nxv8bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfne_vv_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfne(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m2_b8_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv8bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfne_vf_bf16m2_b8_m(vbool8_t mask, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv16bf16.nxv16bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfne_vv_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfne(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m4_b4_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv16bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfne_vf_bf16m4_b4_m(vbool4_t mask, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne(mask, op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv32bf16.nxv32bf16.i64( poison, [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfne_vv_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfne(mask, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m8_b2_m( +// CHECK-RV64-SAME: [[MASK:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv32bf16.bf16.i64( poison, [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfne_vf_bf16m8_b2_m(vbool2_t mask, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne(mask, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfadd.c new file mode 100644 index 0000000000000..6d55279ae306f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfadd.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL:
define dso_local @test_vfadd_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef 
[[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return 
__riscv_vfadd_vv_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], 
[[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], 
[[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, 
size_t vl) { + return __riscv_vfadd_vf_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], 
[[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfadd_vv_bf16m8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_vf_bf16m8_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfclass.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfclass.c new file mode 100644 index 0000000000000..8e6946de398b0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfclass.c @@ -0,0 +1,272 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define
dso_local @test_vfclass_v_bf16mf4_u16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf4_t test_vfclass_v_bf16mf4_u16mf4_tu(vuint16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16mf4_u16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf2_u16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf2_t test_vfclass_v_bf16mf2_u16mf2_tu(vuint16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16mf2_u16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m1_u16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m1_t test_vfclass_v_bf16m1_u16m1_tu(vuint16m1_t vd, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfclass_v_bf16m1_u16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m2_u16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m2_t test_vfclass_v_bf16m2_u16m2_tu(vuint16m2_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfclass_v_bf16m2_u16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m4_u16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m4_t test_vfclass_v_bf16m4_u16m4_tu(vuint16m4_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfclass_v_bf16m4_u16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m8_u16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m8_t test_vfclass_v_bf16m8_u16m8_tu(vuint16m8_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfclass_v_bf16m8_u16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf4_u16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf4_t test_vfclass_v_bf16mf4_u16mf4_tum(vbool64_t vm, vuint16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16mf4_u16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf2_u16mf2_tum( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf2_t test_vfclass_v_bf16mf2_u16mf2_tum(vbool32_t vm, vuint16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16mf2_u16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m1_u16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m1_t test_vfclass_v_bf16m1_u16m1_tum(vbool16_t vm, vuint16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m1_u16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m2_u16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m2_t test_vfclass_v_bf16m2_u16m2_tum(vbool8_t vm, vuint16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m2_u16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m4_u16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m4_t test_vfclass_v_bf16m4_u16m4_tum(vbool4_t vm, vuint16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m4_u16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m8_u16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m8_t test_vfclass_v_bf16m8_u16m8_tum(vbool2_t vm, vuint16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m8_u16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf4_u16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf4_t test_vfclass_v_bf16mf4_u16mf4_tumu(vbool64_t vm, vuint16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16mf4_u16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf2_u16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vuint16mf2_t test_vfclass_v_bf16mf2_u16mf2_tumu(vbool32_t vm, vuint16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16mf2_u16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m1_u16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m1_t test_vfclass_v_bf16m1_u16m1_tumu(vbool16_t vm, vuint16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m1_u16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m2_u16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m2_t test_vfclass_v_bf16m2_u16m2_tumu(vbool8_t vm, vuint16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m2_u16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m4_u16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m4_t test_vfclass_v_bf16m4_u16m4_tumu(vbool4_t vm, vuint16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m4_u16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m8_u16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m8_t test_vfclass_v_bf16m8_u16m8_tumu(vbool2_t vm, vuint16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m8_u16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf4_u16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf4_t test_vfclass_v_bf16mf4_u16mf4_mu(vbool64_t vm, vuint16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16mf4_u16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf2_u16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf2_t test_vfclass_v_bf16mf2_u16mf2_mu(vbool32_t vm, vuint16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16mf2_u16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m1_u16m1_mu( +// CHECK-RV64-SAME: 
[[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m1_t test_vfclass_v_bf16m1_u16m1_mu(vbool16_t vm, vuint16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m1_u16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m2_u16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m2_t test_vfclass_v_bf16m2_u16m2_mu(vbool8_t vm, vuint16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m2_u16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m4_u16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m4_t test_vfclass_v_bf16m4_u16m4_mu(vbool4_t vm, vuint16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m4_u16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m8_u16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m8_t test_vfclass_v_bf16m8_u16m8_mu(vbool2_t vm, vuint16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfclass_v_bf16m8_u16m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmacc.c new file mode 100644 index 0000000000000..2d4e481b1f8bb --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmacc.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16mf4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]]
{ +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16mf4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16mf2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16mf2_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m1_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m1_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t 
test_vfmacc_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m2_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m8_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16mf4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16mf4_tum(mask, vd, rs1, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16mf2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16mf2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m1_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m1_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return 
__riscv_vfmacc_vf_bf16m2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m8_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m8_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16mf4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vf_bf16mf4_tumu(vbool64_t mask, 
vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16mf4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16mf2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16mf2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m1_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m1_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: 
ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m8_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m8_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16mf4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], 
[[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16mf4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16mf2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16mf2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m1_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m1_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_vv_bf16m8_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_vf_bf16m8_mu(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmadd.c new file mode 100644 index 0000000000000..511e073e10143 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmadd.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local
@test_vfmadd_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16mf4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16mf4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16mf2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16mf2_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m1_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m1_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8bf16.nxv8bf16.i64( [[VD]], 
[[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m2_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m8_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return 
__riscv_vfmadd_vv_bf16mf4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16mf4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16mf2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16mf2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m1_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m1_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t 
vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m8_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m8_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t 
test_vfmadd_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16mf4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16mf4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16mf2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16mf2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m1_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m1_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], 
[[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m8_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m8_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16mf4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16mf4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16mf2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16mf2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m1_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m1_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_vv_bf16m8_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_vf_bf16m8_mu(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmax.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmax.c new file mode 100644 index 0000000000000..f3698d41aff34 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmax.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m1_tu( +// CHECK-RV64-SAME:
[[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv32bf16.bf16.i64( 
[[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfmax_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vv_bf16m1_tumu(vbool16_t mask, 
vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmax.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m1_mu( +// CHECK-RV64-SAME: 
[[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_vf_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmax_vv_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return 
__riscv_vfmax_vf_bf16m4_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m8_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vfmax_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+ return __riscv_vfmax_vv_bf16m8_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m8_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vfmax_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfmax_vf_bf16m8_mu(mask, maskedoff, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmerge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmerge.c
new file mode 100644
index 0000000000000..bcaf2cb8817e4
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmerge.c
@@ -0,0 +1,69 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmerge_vfm_bf16mf4_tu(
+// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmerge_vfm_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, vbool64_t mask, size_t vl) {
+ return __riscv_vfmerge_vfm_bf16mf4_tu(maskedoff, op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmerge_vfm_bf16mf2_tu(
+// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf2_t test_vfmerge_vfm_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, vbool32_t mask, size_t vl) {
+ return __riscv_vfmerge_vfm_bf16mf2_tu(maskedoff, op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmerge_vfm_bf16m1_tu(
+// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m1_t test_vfmerge_vfm_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, vbool16_t mask, size_t vl) {
+ return __riscv_vfmerge_vfm_bf16m1_tu(maskedoff, op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmerge_vfm_bf16m2_tu(
+// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m2_t test_vfmerge_vfm_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, vbool8_t mask, size_t vl) {
+ return __riscv_vfmerge_vfm_bf16m2_tu(maskedoff, op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmerge_vfm_bf16m4_tu(
+// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m4_t test_vfmerge_vfm_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, vbool4_t mask, size_t vl) {
+ return __riscv_vfmerge_vfm_bf16m4_tu(maskedoff, op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmerge_vfm_bf16m8_tu(
+// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmerge.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vfmerge_vfm_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, vbool2_t mask, size_t vl) {
+ return __riscv_vfmerge_vfm_bf16m8_tu(maskedoff, op1, op2, mask, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmin.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmin.c
new file mode 100644
index 0000000000000..911f8792e4436
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmin.c
@@ -0,0 +1,489 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmin_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+ return __riscv_vfmin_vv_bf16mf4_tu(maskedoff, op1, op2,
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], 
[[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t 
test_vfmin_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmin.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmin_vv_bf16m8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_vf_bf16m8_mu(mask, 
maskedoff, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmsac.c
new file mode 100644
index 0000000000000..9575ad337de8c
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmsac.c
@@ -0,0 +1,489 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmsac_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+ return __riscv_vfmsac_vv_bf16mf4_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf4_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmsac_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) {
+ return __riscv_vfmsac_vf_bf16mf4_tu(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf2_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf2_t test_vfmsac_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) {
+ return __riscv_vfmsac_vv_bf16mf2_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf2_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf2_t test_vfmsac_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) {
+ return __riscv_vfmsac_vf_bf16mf2_tu(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m1_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m1_t test_vfmsac_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) {
+ return __riscv_vfmsac_vv_bf16m1_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m1_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m2_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmsac.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m8_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16mf4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16mf4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16mf2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16mf2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m1_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m1_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m8_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m8_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16mf4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16mf4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16mf2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16mf2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m1_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m1_tumu( +// CHECK-RV64-SAME: 
[[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m1_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m8_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vfmsac_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m8_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16mf4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16mf4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16mf2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16mf2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return 
__riscv_vfmsac_vv_bf16m1_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m1_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t 
vs2, size_t vl) { + return __riscv_vfmsac_vv_bf16m8_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_vf_bf16m8_mu(mask, vd, rs1, vs2, vl); +}
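The vfmsac file above and the vfmsub file below follow the same autogenerated pattern, so the trailing immediates in the checked calls are worth decoding once: `i64 7` selects the dynamic rounding mode (use the current frm CSR), and the final `i64` is the tail/mask policy word (bit 0 = tail agnostic, bit 1 = mask agnostic, if read against the usual RISC-V vector policy encoding), which is why the `_tu` and `_tum` variants lower with policy 2, `_tumu` with 0, and `_mu` with 1.

A minimal usage sketch of the `_tum` flavor exercised above; this is illustrative only and not part of the patch. The function name and the build flags are assumptions, and it presumes a compiler with the experimental zvfbfa support enabled.

// Build sketch (flags assumed): clang -menable-experimental-extensions \
//   -march=rv64gcv_zvfbfa -O2 msac.c
#include <riscv_vector.h>

// acc[i] = scale * x[i] - acc[i] on active lanes (vfmsac semantics);
// masked-off lanes and the tail keep acc's previous contents because
// of the _tum (tail undisturbed, masked) policy suffix.
vbfloat16m1_t msac_keep_inactive(vbool16_t mask, vbfloat16m1_t acc,
                                 __bf16 scale, vbfloat16m1_t x, size_t vl) {
  return __riscv_vfmsac_vf_bf16m1_tum(mask, acc, scale, x, vl);
}

diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmsub.c new file mode 100644 index 0000000000000..8e382f710ab5a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmsub.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16mf4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16mf4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16mf2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t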
test_vfmsub_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16mf2_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m1_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m1_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m2_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m8_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16mf4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16mf4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16mf2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16mf2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m1_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m1_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m8_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m8_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16mf4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16mf4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16mf2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16mf2_tumu(mask, vd, rs1, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m1_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m1_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return 
__riscv_vfmsub_vf_bf16m4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m8_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m8_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16mf4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16mf4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16mf2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vf_bf16mf2_mu(vbool32_t mask, 
vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16mf2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m1_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m1_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t 
test_vfmsub_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_vv_bf16m8_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_vf_bf16m8_mu(mask, vd, rs1, vs2, vl); +}
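The vfmul file below differs from the FMA files in one visible way: vfmul is a plain binary op, so the passthru is a separate `maskedoff` operand rather than the accumulator `vd`, and the unmasked `_tu` calls lower with only the rounding-mode operand and `vl` (`i64 7, i64 [[VL]]`), while the masked variants append the policy word as before.

A usage sketch of the `_tumu` flavor tested below; illustrative only and not part of the patch. The function name is invented, and the lane semantics described assume the standard RVV policy-suffix convention.

#include <riscv_vector.h>

// out[i] = v[i] * s where `even` is set; other lanes and the tail keep
// `old`'s values (_tumu: tail undisturbed, mask undisturbed). Rounding
// follows the current frm CSR, matching the i64 7 operand in the
// checked IR.
vbfloat16mf2_t scale_masked(vbool32_t even, vbfloat16mf2_t old,
                            vbfloat16mf2_t v, __bf16 s, size_t vl) {
  return __riscv_vfmul_vf_bf16mf2_tumu(even, old, v, s, vl);
}

diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmul.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmul.c new file mode 100644 index 0000000000000..716f056f3e12c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmul.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call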
@llvm.riscv.vfmul.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t 
test_vfmul_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t 
test_vfmul_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vbfloat16m2_t test_vfmul_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmul_vv_bf16m8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_vf_bf16m8_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmv.c new file mode 100644 index 0000000000000..069ee6a2a5df3 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfmv.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_v_f_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv1bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmv_v_f_bf16mf4_tu(vbfloat16mf4_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_v_f_bf16mf4_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_v_f_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv2bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmv_v_f_bf16mf2_tu(vbfloat16mf2_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_v_f_bf16mf2_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_v_f_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv4bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmv_v_f_bf16m1_tu(vbfloat16m1_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_v_f_bf16m1_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_v_f_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv8bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmv_v_f_bf16m2_tu(vbfloat16m2_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_v_f_bf16m2_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_v_f_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv16bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmv_v_f_bf16m4_tu(vbfloat16m4_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_v_f_bf16m4_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_v_f_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv32bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmv_v_f_bf16m8_tu(vbfloat16m8_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_v_f_bf16m8_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef
[[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv1bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmv_s_f_bf16mf4_tu(vbfloat16mf4_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_s_f_bf16mf4_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv2bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmv_s_f_bf16mf2_tu(vbfloat16mf2_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_s_f_bf16mf2_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv4bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmv_s_f_bf16m1_tu(vbfloat16m1_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_s_f_bf16m1_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv8bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmv_s_f_bf16m2_tu(vbfloat16m2_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_s_f_bf16m2_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv16bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmv_s_f_bf16m4_tu(vbfloat16m4_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_s_f_bf16m4_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv32bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmv_s_f_bf16m8_tu(vbfloat16m8_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_s_f_bf16m8_tu(maskedoff, src, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfncvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfncvt.c new file mode 100644 index 0000000000000..36d4fc332499f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfncvt.c @@ -0,0 +1,1577 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// 
RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_tu(vint8mf8_t vd, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf4_i8mf8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_tu(vint8mf4_t vd, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf2_i8mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_tu(vint8mf2_t vd, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m1_i8mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_tu(vint8m1_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m2_i8m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_tu(vint8m2_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m4_i8m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_tu(vint8m4_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m8_i8m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv1i8.nxv1bf16.i64(
[[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_tu(vuint8mf8_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf4_u8mf8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_tu(vuint8mf4_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf2_u8mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_tu(vuint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m1_u8mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_tu(vuint8m1_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m2_u8m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_tu(vuint8m2_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m4_u8m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_tu(vuint8m4_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m8_u8m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_tu(vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_tu(vbfloat16mf2_t vd, vfloat32m1_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_tu(vbfloat16m1_t vd, vfloat32m2_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_tu(vbfloat16m2_t vd, vfloat32m4_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_tu(vbfloat16m4_t vd, vfloat32m8_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_tum(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf4_i8mf8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_tum(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf2_i8mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_tum(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m1_i8mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfncvt_x_f_w_bf16m2_i8m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_tum(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m2_i8m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_tum(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m4_i8m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_tum(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m8_i8m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_tum(vbool64_t vm, vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf4_u8mf8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_tum(vbool32_t vm, vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf2_u8mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_tum(vbool16_t vm, vuint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m1_u8mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_tum(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m2_u8m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_tum(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m4_u8m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_tum(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m8_u8m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_tumu(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf4_i8mf8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_tumu(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf2_i8mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_tumu(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m1_i8mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_tumu(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m2_i8m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_tumu(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return 
__riscv_vfncvt_x_f_w_bf16m4_i8m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_tumu(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m8_i8m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_tumu(vbool64_t vm, vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf4_u8mf8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_tumu(vbool32_t vm, vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf2_u8mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_tumu(vbool16_t vm, vuint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m1_u8mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_tumu(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m2_u8m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_tumu(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m4_u8m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfncvt_xu_f_w_bf16m8_u8m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_tumu(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m8_u8m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_mu(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf4_i8mf8_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_mu(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf2_i8mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_mu(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m1_i8mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_mu(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m2_i8m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_mu(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m4_i8m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_mu(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m8_i8m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] 
+// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_mu(vbool64_t vm, vuint8mf8_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf4_u8mf8_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_mu(vbool32_t vm, vuint8mf4_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf2_u8mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_mu(vbool16_t vm, vuint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m1_u8mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_mu(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m2_u8m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_mu(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m4_u8m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_mu(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m8_u8m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return 
__riscv_vfncvt_f_f_w_bf16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tu(vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tu(vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_rm_tu(vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m1_i8mf2_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_rm_tu(vint8m1_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m2_i8m1_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_rm_tu(vint8m2_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m4_i8m2_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_rm_tu(vint8m4_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m8_i8m4_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tu(vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tu(vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tu(vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tu(vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tu(vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return 
__riscv_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_rm_tu(vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m2_u8m1_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_rm_tu(vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m4_u8m2_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_rm_tu(vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m8_u8m4_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_rm_tu(vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf4_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_rm_tu(vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf2_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_rm_tu(vbfloat16m1_t vd, vfloat32m2_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m1_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfncvt.f.f.w.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_rm_tu(vbfloat16m2_t vd, vfloat32m4_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m2_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_rm_tu(vbfloat16m4_t vd, vfloat32m8_t vs2, + size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m4_rm_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tum(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tum(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_rm_tum(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m1_i8mf2_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_rm_tum(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m2_i8m1_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 
[[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_rm_tum(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m4_i8m2_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_rm_tum(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m8_i8m4_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tum(vbool64_t vm, + vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tum(vm, vd, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tum(vbool32_t vm, + vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tum(vm, vd, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tum(vbool16_t vm, vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_rm_tum(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m2_u8m1_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_rm_tum(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m4_u8m2_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_rm_tum(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m8_u8m4_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_rm_tum(vbool64_t vm, vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf4_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_rm_tum(vbool32_t vm, vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf2_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_rm_tum(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m1_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_rm_tum(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m2_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_rm_tum(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m4_rm_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tumu(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tumu(vm, vd, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tumu(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tumu(vm, vd, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_rm_tumu(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m1_i8mf2_rm_tumu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_rm_tumu(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m2_i8m1_rm_tumu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_rm_tumu(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m4_i8m2_rm_tumu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_rm_tumu(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m8_i8m4_rm_tumu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tumu(vbool64_t vm, + vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tumu(vm, vd, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tumu(vbool32_t vm, + vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tumu(vm, vd, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tumu(vbool16_t vm, + vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tumu(vm, vd, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_rm_tumu(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m2_u8m1_rm_tumu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_rm_tumu(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m4_u8m2_rm_tumu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_rm_tumu(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m8_u8m4_rm_tumu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_rm_tumu(vbool64_t vm, + vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf4_rm_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_rm_tumu(vbool32_t vm, + vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf2_rm_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_rm_tumu(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m1_rm_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_rm_tumu(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m2_rm_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_rm_tumu(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m4_rm_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_mu(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf4_i8mf8_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_mu(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_f_w_bf16mf2_i8mf4_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_rm_mu(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m1_i8mf2_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_rm_mu(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m2_i8m1_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_rm_mu(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m4_i8m2_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_rm_mu(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_f_w_bf16m8_i8m4_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_mu(vbool64_t vm, vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_mu(vbool32_t vm, vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_mu(vbool16_t vm, vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m1_u8mf2_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_rm_mu(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m2_u8m1_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_rm_mu(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m4_u8m2_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_rm_mu(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_f_w_bf16m8_u8m4_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], 
[[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_rm_mu(vbool64_t vm, vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf4_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_rm_mu(vbool32_t vm, vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16mf2_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_rm_mu(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m1_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_rm_mu(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m2_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_rm_mu(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_f_w_bf16m4_rm_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfncvt_rod.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfncvt_rod.c new file mode 100644 index 0000000000000..84066846e188c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfncvt_rod.c @@ -0,0 +1,233 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck 
--check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_rod_f_f_w_bf16mf4_tu(vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_rod_f_f_w_bf16mf2_tu(vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_rod_f_f_w_bf16m1_tu(vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_rod_f_f_w_bf16m2_tu(vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_rod_f_f_w_bf16m4_tu(vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_rod_f_f_w_bf16mf4_tum(vbool64_t vm, + vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +//
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_rod_f_f_w_bf16mf2_tum(vbool32_t vm, + vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_rod_f_f_w_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_rod_f_f_w_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_rod_f_f_w_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_rod_f_f_w_bf16mf4_tumu(vbool64_t vm, + vbfloat16mf4_t vd, + vfloat32mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_rod_f_f_w_bf16mf2_tumu(vbool32_t vm, + vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_rod_f_f_w_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, 
size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_rod_f_f_w_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_rod_f_f_w_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_rod_f_f_w_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_rod_f_f_w_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_rod_f_f_w_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_rod_f_f_w_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m4_mu( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_rod_f_f_w_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_f_w_bf16m4_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfncvt_rtz.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfncvt_rtz.c new file mode 100644 index 0000000000000..4644eff12e210 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfncvt_rtz.c @@ -0,0 +1,572 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tu(vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tu(vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tu(vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m2_i8m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_rtz_x_f_w_bf16m2_i8m1_tu(vint8m1_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m2_i8m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m4_i8m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT:
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_rtz_x_f_w_bf16m4_i8m2_tu(vint8m2_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m4_i8m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m8_i8m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_rtz_x_f_w_bf16m8_i8m4_tu(vint8m4_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m8_i8m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tu(vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tu(vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tu(vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tu(vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tu(vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tu(vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tum(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tum(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tum(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m2_i8m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_rtz_x_f_w_bf16m2_i8m1_tum(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m2_i8m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m4_i8m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_rtz_x_f_w_bf16m4_i8m2_tum(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m4_i8m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m8_i8m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_rtz_x_f_w_bf16m8_i8m4_tum(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m8_i8m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tum(vbool64_t vm, + vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tum(vbool32_t vm, + vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tum(vbool16_t vm, + vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tum(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tum(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tum(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tumu(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tumu(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tumu(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m2_i8m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_rtz_x_f_w_bf16m2_i8m1_tumu(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m2_i8m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m4_i8m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_rtz_x_f_w_bf16m4_i8m2_tumu(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m4_i8m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m8_i8m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_rtz_x_f_w_bf16m8_i8m4_tumu(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m8_i8m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tumu(vbool64_t vm, + vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tumu(vbool32_t vm, + vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tumu(vbool16_t vm, + vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tumu(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tumu(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tumu(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_mu(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_mu(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_mu(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m1_i8mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m2_i8m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_rtz_x_f_w_bf16m2_i8m1_mu(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m2_i8m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m4_i8m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_rtz_x_f_w_bf16m4_i8m2_mu(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m4_i8m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m8_i8m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_rtz_x_f_w_bf16m8_i8m4_mu(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_f_w_bf16m8_i8m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_mu(vbool64_t vm, + vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_mu(vbool32_t vm, + vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_mu(vbool16_t vm, vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_mu(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m2_u8m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_mu(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m4_u8m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t 
test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_mu(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_f_w_bf16m8_u8m4_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfnmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfnmacc.c new file mode 100644 index 0000000000000..93fd6ba2faff7 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfnmacc.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16mf4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16mf4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16mf2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16mf2_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16m1_t test_vfnmacc_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m1_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m1_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m2_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m8_tu( +// 
CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m8_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16mf4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16mf4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16mf2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16mf2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m1_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vfnmacc_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m1_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return 
__riscv_vfnmacc_vv_bf16m8_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m8_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16mf4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16mf4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16mf2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16mf2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t 
test_vfnmacc_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m1_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m1_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], 
[[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m8_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m8_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16mf4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16mf4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16mf2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16mf2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m1_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m1_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_vv_bf16m8_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_vf_bf16m8_mu(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfnmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfnmadd.c new file mode 100644 index 0000000000000..d7e6b8225d8c0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfnmadd.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16mf4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16mf4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16mf2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf2_tu( +// CHECK-RV64-SAME: 
[[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16mf2_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m1_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m1_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m2_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 
[[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m8_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16mf4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16mf4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16mf2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16mf2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m1_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m1_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv16bf16.bf16.i64( [[VD]], bfloat 
[[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m8_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m8_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16mf4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16mf4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16mf2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16mf2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m1_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m1_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef 
[[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m8_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m8_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16mf4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16mf4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16mf2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfnmadd_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16mf2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m1_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m1_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_vf_bf16m2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_vv_bf16m4_mu(mask, vd, vs1, 
vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfnmadd_vf_bf16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfnmadd.mask.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfnmadd_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) {
+  return __riscv_vfnmadd_vf_bf16m4_mu(mask, vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmadd_vv_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmadd.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmadd_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) {
+  return __riscv_vfnmadd_vv_bf16m8_mu(mask, vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmadd_vf_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmadd.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmadd_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) {
+  return __riscv_vfnmadd_vf_bf16m8_mu(mask, vd, rs1, vs2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfnmsac.c
new file mode 100644
index 0000000000000..e0c289d23c17f
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfnmsac.c
@@ -0,0 +1,489 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsac_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsac_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vv_bf16mf4_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsac_vf_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsac_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vf_bf16mf4_tu(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfnmsac_vv_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsac.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfnmsac_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vv_bf16mf2_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfnmsac_vf_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsac.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfnmsac_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vf_bf16mf2_tu(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfnmsac_vv_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsac.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS1]], <vscale x 4 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfnmsac_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vv_bf16m1_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfnmsac_vf_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsac.nxv4bf16.bf16.i64(<vscale x 4 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 4 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfnmsac_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vf_bf16m1_tu(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfnmsac_vv_bf16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS1:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfnmsac.nxv8bf16.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS1]], <vscale x 8 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+vbfloat16m2_t test_vfnmsac_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vv_bf16m2_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfnmsac_vf_bf16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfnmsac.nxv8bf16.bf16.i64(<vscale x 8 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 8 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+vbfloat16m2_t test_vfnmsac_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vf_bf16m2_tu(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfnmsac_vv_bf16m4_tu(
+//
CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m8_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16mf4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16mf4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16mf2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16mf2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m1_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m1_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], 
[[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m8_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m8_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16mf4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16mf4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local @test_vfnmsac_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16mf2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16mf2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m1_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m1_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t 
vl) { + return __riscv_vfnmsac_vf_bf16m2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m8_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m8_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16mf4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t 
test_vfnmsac_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16mf4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16mf2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16mf2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m1_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_vf_bf16m1_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_vv_bf16m2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 
[[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+vbfloat16m2_t test_vfnmsac_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vf_bf16m2_mu(mask, vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfnmsac_vv_bf16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS1:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfnmsac.mask.nxv16bf16.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfnmsac_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vv_bf16m4_mu(mask, vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfnmsac_vf_bf16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfnmsac.mask.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfnmsac_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vf_bf16m4_mu(mask, vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmsac_vv_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmsac.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmsac_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vv_bf16m8_mu(mask, vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmsac_vf_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmsac.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmsac_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) {
+  return __riscv_vfnmsac_vf_bf16m8_mu(mask, vd, rs1, vs2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfnmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfnmsub.c
new file mode 100644
index 0000000000000..05ccda36e9032
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfnmsub.c
@@ -0,0 +1,489 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsub_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsub_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfnmsub_vv_bf16mf4_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsub_vf_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsub.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsub_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfnmsub_vf_bf16mf4_tu(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfnmsub_vv_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsub.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfnmsub_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfnmsub_vv_bf16mf2_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfnmsub_vf_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfnmsub.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfnmsub_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfnmsub_vf_bf16mf2_tu(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfnmsub_vv_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsub.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS1]], <vscale x 4 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfnmsub_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) {
+  return __riscv_vfnmsub_vv_bf16m1_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfnmsub_vf_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfnmsub.nxv4bf16.bf16.i64(<vscale x 4 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 4 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfnmsub_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) {
+  return __riscv_vfnmsub_vf_bf16m1_tu(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfnmsub_vv_bf16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS1:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfnmsub.nxv8bf16.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS1]], <vscale x 8 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat>
[[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m2_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m4_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m8_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16mf4_tum(mask, vd, vs1, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16mf4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16mf2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16mf2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m1_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m1_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, 
size_t vl) { + return __riscv_vfnmsub_vv_bf16m2_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m2_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m4_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m4_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m8_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m8_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t 
test_vfnmsub_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16mf4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16mf4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16mf2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16mf2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m1_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m1_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.nxv8bf16.i64( 
[[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m2_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m2_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m4_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m4_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m8_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m8_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16mf4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16mf4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16mf2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16mf2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m1_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m1_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m2_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m2_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m4_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m4_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_vv_bf16m8_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_vf_bf16m8_mu(mask, vd, rs1, vs2, vl); +} + diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfrec7.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfrec7.c new file mode 100644 index 0000000000000..3123692b45a68 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfrec7.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrec7_v_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16mf4_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrec7_v_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16mf2_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrec7_v_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m1_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrec7_v_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m2_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrec7_v_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m4_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call
@llvm.riscv.vfrec7.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrec7_v_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m8_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrec7_v_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16mf4_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrec7_v_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16mf2_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrec7_v_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m1_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrec7_v_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m2_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrec7_v_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m4_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrec7_v_bf16m8_tum(vbool2_t mask, 
vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m8_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrec7_v_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16mf4_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrec7_v_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16mf2_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrec7_v_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m1_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrec7_v_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m2_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrec7_v_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m4_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrec7_v_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m8_tumu(mask, maskedoff, op1, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrec7_v_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16mf4_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrec7_v_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16mf2_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrec7_v_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m1_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrec7_v_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m2_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrec7_v_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m4_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrec7_v_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrec7_v_bf16m8_mu(mask, maskedoff, op1, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfrsqrt7.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfrsqrt7.c new file mode 100644 index 0000000000000..8436f0ea488b2 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfrsqrt7.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsqrt7_v_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16mf4_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsqrt7_v_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16mf2_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsqrt7_v_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m1_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsqrt7_v_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m2_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsqrt7_v_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m4_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +//
+vbfloat16m8_t test_vfrsqrt7_v_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m8_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsqrt7_v_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16mf4_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsqrt7_v_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16mf2_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsqrt7_v_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m1_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsqrt7_v_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m2_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsqrt7_v_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m4_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsqrt7_v_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m8_tum(mask, maskedoff, op1, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsqrt7_v_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16mf4_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsqrt7_v_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16mf2_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsqrt7_v_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m1_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsqrt7_v_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m2_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsqrt7_v_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m4_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsqrt7_v_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m8_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], 
[[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsqrt7_v_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16mf4_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsqrt7_v_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16mf2_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsqrt7_v_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m1_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsqrt7_v_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m2_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsqrt7_v_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m4_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsqrt7_v_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrsqrt7_v_bf16m8_mu(mask, maskedoff, op1, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfrsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfrsub.c new file mode 100644 index 0000000000000..7dd2bb6cc502d --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfrsub.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsub_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsub_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsub_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsub_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsub_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]])
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsub_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsub_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsub_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsub_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsub_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsub_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m4_tum(mask, 
maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsub_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsub_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsub_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsub_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsub_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], 
[[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsub_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsub_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsub_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsub_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsub_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsub_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m4_mu( +// CHECK-RV64-SAME: 
[[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsub_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsub_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_vf_bf16m8_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfsgnj.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfsgnj.c new file mode 100644 index 0000000000000..b39a0be2238a9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfsgnj.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t
op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m4_tu( +// CHECK-RV64-SAME: 
[[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf2_tum( +// 
CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, 
vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.nxv2bf16.i64( 
[[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, 
size_t vl) { + return __riscv_vfsgnj_vf_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 
[[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnj_vv_bf16m8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_vf_bf16m8_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfsgnjn.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfsgnjn.c new file mode 100644 index 0000000000000..7542e7866c888 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfsgnjn.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local
@test_vfsgnjn_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], 
[[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, 
size_t vl) { + return __riscv_vfsgnjn_vf_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfsgnjn.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfsgnjn_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t 
test_vfsgnjn_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjn_vv_bf16m8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_vf_bf16m8_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfsgnjx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfsgnjx.c new file mode 100644 index 0000000000000..104149e1b2fa4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfsgnjx.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16mf4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf2_tu( 
+// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16mf2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m4_tum(mask, maskedoff, op1, op2, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: 
ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return 
__riscv_vfsgnjx_vf_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 
[[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjx_vv_bf16m8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_vf_bf16m8_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfslide1down.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfslide1down.c new file mode 100644 index 0000000000000..228dc1cd064a8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfslide1down.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: 
define dso_local @test_vfslide1down_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1down_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16mf4_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1down_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16mf2_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1down_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m1_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1down_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m2_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1down_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m4_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1down_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return 
__riscv_vfslide1down_vf_bf16m8_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1down_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16mf4_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1down_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16mf2_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1down_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m1_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1down_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m2_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1down_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m4_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1down_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m8_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1down_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16mf4_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1down_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16mf2_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1down_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m1_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1down_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m2_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1down_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m4_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1down_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m8_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1down_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16mf4_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1down_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16mf2_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1down_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m1_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1down_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m2_mu(mask, maskedoff, src, value, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1down_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m4_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1down_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_vf_bf16m8_mu(mask, maskedoff, src, value, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfslide1up.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfslide1up.c new file mode 100644 index 0000000000000..9e6ff2b16f77d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfslide1up.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1up_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16mf4_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1up_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16mf2_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfslide1up.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1up_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m1_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1up_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m2_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1up_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m4_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1up_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m8_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1up_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16mf4_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1up_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16mf2_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat 
noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1up_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m1_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1up_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m2_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1up_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m4_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1up_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m8_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1up_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16mf4_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16mf2_t test_vfslide1up_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16mf2_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1up_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m1_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1up_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m2_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1up_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m4_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1up_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m8_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1up_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16mf4_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf2_mu( 
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1up_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16mf2_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1up_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m1_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1up_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m2_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1up_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m4_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1up_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_vf_bf16m8_mu(mask, maskedoff, src, value, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfsub.c new file mode 100644 index 0000000000000..b6fd94ece20cd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfsub.c @@ -0,0 +1,489 @@ +// 
NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfsub_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfsub.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfsub_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vfsub_vv_bf16mf4_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfsub_vf_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfsub.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfsub_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfsub_vf_bf16mf4_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfsub_vv_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfsub.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfsub_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+  return __riscv_vfsub_vv_bf16mf2_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfsub_vf_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfsub.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfsub_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfsub_vf_bf16mf2_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfsub_vv_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfsub.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfsub_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) {
+  return __riscv_vfsub_vv_bf16m1_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfsub_vf_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfsub.nxv4bf16.bf16.i64(<vscale x 4 x bfloat> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], bfloat [[OP2]], 
i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m1_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m2_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m4_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return 
__riscv_vfsub_vf_bf16m8_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16mf4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16mf2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], 
[[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m1_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m2_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m4_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m8_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16mf4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16mf2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return 
__riscv_vfsub_vv_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m1_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m2_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m4_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], 
i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m8_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16mf4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16mf2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], 
[[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m1_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_vf_bf16m2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsub_vv_bf16m4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return 
__riscv_vfsub_vf_bf16m4_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfsub_vv_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], <vscale x 32 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfsub.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfsub_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+  return __riscv_vfsub_vv_bf16m8_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfsub_vf_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfsub.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfsub_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfsub_vf_bf16m8_mu(mask, maskedoff, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwadd.c
new file mode 100644
index 0000000000000..4bee376cfe0fb
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwadd.c
@@ -0,0 +1,2007 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwadd_vv_bf16mf4_f32mf2_tu(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.nxv1bf16.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x bfloat> [[VS1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd,
+                                               vbfloat16mf4_t vs2,
+                                               vbfloat16mf4_t vs1, size_t vl) {
+  return __riscv_vfwadd_vv_bf16mf4_f32mf2_tu(vd, vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwadd_vf_bf16mf4_f32mf2_tu(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.bf16.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x bfloat> [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd,
+                                               vbfloat16mf4_t vs2, __bf16 rs1,
+                                               size_t vl) {
+  return __riscv_vfwadd_vf_bf16mf4_f32mf2_tu(vd, vs2, rs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwadd_wv_bf16mf4_f32mf2_tu(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwadd.w.nxv1f32.nxv1bf16.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x bfloat> [[VS1]], i64 
7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf4_f32mf2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32mf2_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf2_f32m1_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf2_f32m1_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf2_f32m1_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m1_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + 
return __riscv_vfwadd_vv_bf16m1_f32m2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16m1_f32m2_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m1_f32m2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m2_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m2_f32m4_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16m2_f32m4_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m2_f32m4_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef 
[[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m4_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m4_f32m8_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16m4_f32m8_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m4_f32m8_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m8_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf4_f32mf2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf4_f32mf2_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf4_f32mf2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32mf2_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf2_f32m1_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf2_f32m1_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf2_f32m1_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_tum( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m1_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m1_f32m2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m1_f32m2_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m1_f32m2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m2_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return 
__riscv_vfwadd_vv_bf16m2_f32m4_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m2_f32m4_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m2_f32m4_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m4_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m4_f32m8_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m4_f32m8_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m4_f32m8_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m8_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_bf16mf4_f32mf2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf4_f32mf2_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16mf4_f32mf2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32mf2_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf2_f32m1_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf2_f32m1_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf2_f32m1_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m1_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m1_f32m2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m1_f32m2_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwadd_wv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m1_f32m2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m2_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m2_f32m4_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m2_f32m4_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m2_f32m4_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, 
__bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m4_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m4_f32m8_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m4_f32m8_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m4_f32m8_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m8_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf4_f32mf2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], 
[[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf4_f32mf2_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf4_f32mf2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32mf2_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf2_f32m1_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf2_f32m1_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf2_f32m1_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m1_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m1_f32m2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m1_f32m2_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16m1_f32m2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m2_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m2_f32m4_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_mu( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m2_f32m4_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16m2_f32m4_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m4_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m4_f32m8_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m4_f32m8_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return 
__riscv_vfwadd_wv_bf16m4_f32m8_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m8_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_bf16mf4_f32mf2_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16mf4_f32mf2_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16mf4_f32mf2_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_rm_tu(vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32mf2_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, 
size_t vl) { + return __riscv_vfwadd_vv_bf16mf2_f32m1_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf2_f32m1_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf2_f32m1_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_rm_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m1_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m1_f32m2_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m1_f32m2_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { 
+ return __riscv_vfwadd_wv_bf16m1_f32m2_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_rm_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m2_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m2_f32m4_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m2_f32m4_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m2_f32m4_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_rm_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m4_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return 
__riscv_vfwadd_vv_bf16m4_f32m8_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m4_f32m8_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m4_f32m8_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_rm_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m8_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_bf16mf4_f32mf2_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16mf4_f32mf2_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16mf4_f32mf2_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_rm_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32mf2_rm_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_bf16mf2_f32m1_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf2_f32m1_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16mf2_f32m1_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m1_rm_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwadd_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m1_f32m2_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m1_f32m2_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m1_f32m2_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m2_rm_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m2_f32m4_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 
[[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m2_f32m4_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m2_f32m4_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m4_rm_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m4_f32m8_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m4_f32m8_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m4_f32m8_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_rm_tum( 
+// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m8_rm_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_bf16mf4_f32mf2_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16mf4_f32mf2_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16mf4_f32mf2_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_rm_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32mf2_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 
[[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_bf16mf2_f32m1_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16mf2_f32m1_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16mf2_f32m1_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m1_rm_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m1_f32m2_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m1_f32m2_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m1_f32m2_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m2_rm_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m2_f32m4_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m2_f32m4_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m2_f32m4_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat 
[[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m4_rm_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m4_f32m8_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m4_f32m8_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m4_f32m8_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m8_rm_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_bf16mf4_f32mf2_rm_mu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_bf16mf4_f32mf2_rm_mu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_bf16mf4_f32mf2_rm_mu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_rm_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32mf2_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16mf2_f32m1_rm_mu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16mf2_f32m1_rm_mu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( 
[[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16mf2_f32m1_rm_mu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m1_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m1_f32m2_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m1_f32m2_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m1_f32m2_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m2_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vfwadd_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m2_f32m4_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m2_f32m4_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m2_f32m4_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m4_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_bf16m4_f32m8_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 
1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_bf16m4_f32m8_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_bf16m4_f32m8_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_bf16_f32m8_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwcvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwcvt.c new file mode 100644 index 0000000000000..9151319fcfb17 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwcvt.c @@ -0,0 +1,765 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_x_v_bf16mf4_tu(vbfloat16mf4_t vd, vint8mf8_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_x_v_bf16mf2_tu(vbfloat16mf2_t vd, vint8mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call
@llvm.riscv.vfwcvt.f.x.v.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_x_v_bf16m1_tu(vbfloat16m1_t vd, vint8mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_x_v_bf16m2_tu(vbfloat16m2_t vd, vint8m1_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_x_v_bf16m4_tu(vbfloat16m4_t vd, vint8m2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_x_v_bf16m8_tu(vbfloat16m8_t vd, vint8m4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_xu_v_bf16mf4_tu(vbfloat16mf4_t vd, vuint8mf8_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_xu_v_bf16mf2_tu(vbfloat16mf2_t vd, vuint8mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_xu_v_bf16m1_tu(vbfloat16m1_t vd, vuint8mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_xu_v_bf16m2_tu(vbfloat16m2_t vd, vuint8m1_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_xu_v_bf16m4_tu(vbfloat16m4_t vd, vuint8m2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_xu_v_bf16m8_tu(vbfloat16m8_t vd, vuint8m4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwcvt_f_f_v_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16mf4_f32mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwcvt_f_f_v_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16mf2_f32m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_bf16m1_f32m2_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m1_f32m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwcvt_f_f_v_bf16m2_f32m4_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m2_f32m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], i64 
[[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwcvt_f_f_v_bf16m4_f32m8_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m4_f32m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_x_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + vint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_x_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + vint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_x_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_x_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_x_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_x_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, + vint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwcvt_f_xu_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_xu_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + vuint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_xu_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + vuint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_xu_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vuint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_xu_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vuint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_xu_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vuint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_xu_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, + vuint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwcvt_f_f_v_bf16mf4_f32mf2_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16mf4_f32mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwcvt_f_f_v_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16mf2_f32m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m1_f32m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwcvt_f_f_v_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m2_f32m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwcvt_f_f_v_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m4_f32m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_x_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + vint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t 
test_vfwcvt_f_x_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + vint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_x_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_x_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_x_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_x_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, + vint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_xu_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + vuint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_xu_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + vuint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m1_tumu( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_xu_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vuint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_xu_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vuint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_xu_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vuint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_xu_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, + vuint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwcvt_f_f_v_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16mf4_f32mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwcvt_f_f_v_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16mf2_f32m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m1_f32m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwcvt_f_f_v_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m2_f32m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwcvt_f_f_v_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m4_f32m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_x_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_x_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_x_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_x_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vint8m1_t 
vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_x_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_x_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, + vint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_x_v_bf16m8_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_xu_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vuint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_xu_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vuint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_xu_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vuint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_xu_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vuint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_xu_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vuint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_xu_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, + vuint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_xu_v_bf16m8_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwcvt_f_f_v_bf16mf4_f32mf2_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16mf4_f32mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwcvt_f_f_v_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16mf2_f32m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m1_f32m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwcvt_f_f_v_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m2_f32m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat32m8_t test_vfwcvt_f_f_v_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_f_v_bf16m4_f32m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwmacc.c new file mode 100644 index 0000000000000..f67b1001a5fcb --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwmacc.c @@ -0,0 +1,1017 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16mf4_f32mf2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16mf4_f32mf2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16mf2_f32m1_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16mf2_f32m1_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call
@llvm.riscv.vfwmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m1_f32m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16m1_f32m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m2_f32m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16m2_f32m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m4_f32m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16m4_f32m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], 
[[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16mf4_f32mf2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf4_f32mf2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16mf2_f32m1_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf2_f32m1_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m1_f32m2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m1_f32m2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], 
[[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m2_f32m4_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m2_f32m4_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m4_f32m8_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m4_f32m8_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16mf4_f32mf2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + 
vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf4_f32mf2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16mf2_f32m1_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf2_f32m1_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m1_f32m2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m1_f32m2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m2_f32m4_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m2_f32m4_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m4_f32m8_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m4_f32m8_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16mf4_f32mf2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf4_f32mf2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16mf2_f32m1_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwmacc_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf2_f32m1_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m1_f32m2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m1_f32m2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m2_f32m4_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m2_f32m4_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_mu(vbool4_t vm, 
vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m4_f32m8_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m4_f32m8_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16mf4_f32mf2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf4_f32mf2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16mf2_f32m1_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf2_f32m1_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 
0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m1_f32m2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16m1_f32m2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m2_f32m4_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16m2_f32m4_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m4_f32m8_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16m4_f32m8_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16mf4_f32mf2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_rm_tum( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16mf4_f32mf2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16mf2_f32m1_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf2_f32m1_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m1_f32m2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t 
vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m1_f32m2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m2_f32m4_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m2_f32m4_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m4_f32m8_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m4_f32m8_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16mf4_f32mf2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat 
noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_rm_tumu( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_vf_bf16mf4_f32mf2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16mf2_f32m1_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf2_f32m1_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16m1_f32m2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m1_f32m2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16m2_f32m4_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m2_f32m4_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16m4_f32m8_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m4_f32m8_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16mf4_f32mf2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf4_f32mf2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vv_bf16mf2_f32m1_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16mf2_f32m1_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m1_f32m2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_vf_bf16m1_f32m2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_vv_bf16m2_f32m4_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
<vscale x 8 x float> @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64(<vscale x 8 x float> [[VD]], bfloat [[VS1]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd,
+                                                __bf16 vs1, vbfloat16m2_t vs2,
+                                                size_t vl) {
+  return __riscv_vfwmacc_vf_bf16m2_f32m4_rm_mu(vm, vd, vs1, vs2,
+                                               __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwmacc_vv_bf16m4_f32m8_rm_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x bfloat> [[VS1:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x bfloat> [[VS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd,
+                                                vbfloat16m4_t vs1,
+                                                vbfloat16m4_t vs2, size_t vl) {
+  return __riscv_vfwmacc_vv_bf16m4_f32m8_rm_mu(vm, vd, vs1, vs2,
+                                               __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_vfwmacc_vf_bf16m4_f32m8_rm_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], bfloat noundef [[VS1:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64(<vscale x 16 x float> [[VD]], bfloat [[VS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret <vscale x 16 x float> [[TMP0]]
+//
+vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd,
+                                                __bf16 vs1, vbfloat16m4_t vs2,
+                                                size_t vl) {
+  return __riscv_vfwmacc_vf_bf16m4_f32m8_rm_mu(vm, vd, vs1, vs2,
+                                               __RISCV_FRM_RNE, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwmsac.c
new file mode 100644
index 0000000000000..6d78c74e14694
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwmsac.c
@@ -0,0 +1,1017 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwmsac_vv_bf16mf4_f32mf2_tu(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwmsac.nxv1f32.nxv1bf16.nxv1bf16.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd,
+                                                vbfloat16mf4_t vs1,
+                                                vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfwmsac_vv_bf16mf4_f32mf2_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_vfwmsac_vf_bf16mf4_f32mf2_tu(
+// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], bfloat noundef [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.vfwmsac.nxv1f32.bf16.nxv1bf16.i64(<vscale x 1 x float> [[VD]], bfloat [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 1 x float> [[TMP0]]
+//
+vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, __bf16 vs1,
+                                                vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfwmsac_vf_bf16mf4_f32mf2_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwmsac_vv_bf16mf2_f32m1_tu(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwmsac.nxv2f32.nxv2bf16.nxv2bf16.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x bfloat> [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd,
+                                              vbfloat16mf2_t vs1,
+                                              vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfwmsac_vv_bf16mf2_f32m1_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_vfwmsac_vf_bf16mf2_f32m1_tu(
+// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], bfloat noundef [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.vfwmsac.nxv2f32.bf16.nxv2bf16.i64(<vscale x 2 x float> [[VD]], bfloat [[VS1]], <vscale x 2 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 2 x float> [[TMP0]]
+//
+vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, __bf16 vs1,
+                                              vbfloat16mf2_t vs2, size_t vl) {
+  return __riscv_vfwmsac_vf_bf16mf2_f32m1_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwmsac_vv_bf16m1_f32m2_tu(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x bfloat> [[VS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwmsac.nxv4f32.nxv4bf16.nxv4bf16.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x bfloat> [[VS1]], <vscale x 4 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs1,
+                                             vbfloat16m1_t vs2, size_t vl) {
+  return __riscv_vfwmsac_vv_bf16m1_f32m2_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_vfwmsac_vf_bf16m1_f32m2_tu(
+// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], bfloat noundef [[VS1:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.vfwmsac.nxv4f32.bf16.nxv4bf16.i64(<vscale x 4 x float> [[VD]], bfloat [[VS1]], <vscale x 4 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+//
+vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, __bf16 vs1,
+                                             vbfloat16m1_t vs2, size_t vl) {
+  return __riscv_vfwmsac_vf_bf16m1_f32m2_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwmsac_vv_bf16m2_f32m4_tu(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x bfloat> [[VS1:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwmsac.nxv8f32.nxv8bf16.nxv8bf16.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x bfloat> [[VS1]], <vscale x 8 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs1,
+                                             vbfloat16m2_t vs2, size_t vl) {
+  return __riscv_vfwmsac_vv_bf16m2_f32m4_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_vfwmsac_vf_bf16m2_f32m4_tu(
+// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], bfloat noundef [[VS1:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.vfwmsac.nxv8f32.bf16.nxv8bf16.i64(<vscale x 8 x float> [[VD]], bfloat [[VS1]], <vscale x 8 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT:    ret <vscale x 8 x float> [[TMP0]]
+//
+vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, __bf16 vs1,
+                                             vbfloat16m2_t vs2, size_t vl) {
+  return __riscv_vfwmsac_vf_bf16m2_f32m4_tu(vd, vs1, vs2, vl);
+}
+
+//
CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m4_f32m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16m4_f32m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16mf4_f32mf2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf4_f32mf2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16mf2_f32m1_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + 
__bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf2_f32m1_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m1_f32m2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m1_f32m2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m2_f32m4_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m2_f32m4_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m4_f32m8_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m4_f32m8_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16mf4_f32mf2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf4_f32mf2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16mf2_f32m1_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf2_f32m1_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m1_f32m2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vfwmsac_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m1_f32m2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m2_f32m4_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m2_f32m4_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m4_f32m8_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m4_f32m8_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t 
test_vfwmsac_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16mf4_f32mf2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf4_f32mf2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16mf2_f32m1_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf2_f32m1_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m1_f32m2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m1_f32m2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m2_f32m4_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m2_f32m4_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m4_f32m8_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m4_f32m8_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16mf4_f32mf2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf4_f32mf2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwmsac_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16mf2_f32m1_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf2_f32m1_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m1_f32m2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16m1_f32m2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m2_f32m4_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return 
__riscv_vfwmsac_vf_bf16m2_f32m4_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m4_f32m8_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16m4_f32m8_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16mf4_f32mf2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_rm_tum( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16mf4_f32mf2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16mf2_f32m1_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf2_f32m1_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m1_f32m2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m1_f32m2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m2_f32m4_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m2_f32m4_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return 
__riscv_vfwmsac_vv_bf16m4_f32m8_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m4_f32m8_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16mf4_f32mf2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_rm_tumu( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_vf_bf16mf4_f32mf2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16mf2_f32m1_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf2_f32m1_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16m1_f32m2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m1_f32m2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16m2_f32m4_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m2_f32m4_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16m4_f32m8_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] 
+// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m4_f32m8_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16mf4_f32mf2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf4_f32mf2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vv_bf16mf2_f32m1_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16mf2_f32m1_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m1_f32m2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwmsac_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m1_f32m2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m2_f32m4_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m2_f32m4_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_vv_bf16m4_f32m8_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_vf_bf16m4_f32m8_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwmul.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwmul.c new file mode 100644 index 0000000000000..9fcfe818be71f --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwmul.c @@ -0,0 +1,1015 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf4_f32mf2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf4_f32mf2_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf2_f32m1_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf2_f32m1_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m1_f32m2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +//
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16m1_f32m2_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m2_f32m4_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16m2_f32m4_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m4_f32m8_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16m4_f32m8_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf4_f32mf2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf4_f32mf2_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf2_f32m1_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf2_f32m1_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m1_f32m2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m1_f32m2_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m2_f32m4_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: 
[[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m2_f32m4_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m4_f32m8_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m4_f32m8_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwmul_vv_bf16mf4_f32mf2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf4_f32mf2_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + 
vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf2_f32m1_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf2_f32m1_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m1_f32m2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m1_f32m2_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m2_f32m4_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m2_f32m4_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], 
[[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m4_f32m8_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m4_f32m8_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf4_f32mf2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf4_f32mf2_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf2_f32m1_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf2_f32m1_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], 
[[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m1_f32m2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m1_f32m2_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m2_f32m4_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m2_f32m4_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m4_f32m8_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m4_f32m8_mu(vm, vd, vs2, 
rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwmul_vv_bf16mf4_f32mf2_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16mf4_f32mf2_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf2_f32m1_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf2_f32m1_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m1_f32m2_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return 
__riscv_vfwmul_vf_bf16m1_f32m2_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m2_f32m4_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m2_f32m4_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m4_f32m8_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m4_f32m8_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwmul_vv_bf16mf4_f32mf2_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16mf4_f32mf2_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwmul_vv_bf16mf2_f32m1_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf2_f32m1_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m1_f32m2_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m1_f32m2_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m2_f32m4_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_rm_tum( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m2_f32m4_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m4_f32m8_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m4_f32m8_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwmul_vv_bf16mf4_f32mf2_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16mf4_f32mf2_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, 
i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwmul_vv_bf16mf2_f32m1_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16mf2_f32m1_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m1_f32m2_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m1_f32m2_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m2_f32m4_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m2_f32m4_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + 
+// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m4_f32m8_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m4_f32m8_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwmul_vv_bf16mf4_f32mf2_rm_mu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_vf_bf16mf4_f32mf2_rm_mu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16mf2_f32m1_rm_mu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16mf2_f32m1_rm_mu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m1_f32m2_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m1_f32m2_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m2_f32m4_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m2_f32m4_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_vv_bf16m4_f32m8_rm_mu(vm, 
vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_vf_bf16m4_f32m8_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwnmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwnmacc.c new file mode 100644 index 0000000000000..73cc82219b201 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwnmacc.c @@ -0,0 +1,1034 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf4_f32mf2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf4_f32mf2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf2_f32m1_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.bf16.nxv2bf16.i64( 
[[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf2_f32m1_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m1_f32m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16m1_f32m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m2_f32m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16m2_f32m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m4_f32m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16m4_f32m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf4_f32mf2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_tum(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf4_f32mf2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf2_f32m1_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf2_f32m1_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m1_f32m2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m1_f32m2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m2_f32m4_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m2_f32m4_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m4_f32m8_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m4_f32m8_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return 
__riscv_vfwnmacc_vv_bf16mf4_f32mf2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf4_f32mf2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf2_f32m1_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf2_f32m1_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m1_f32m2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m1_f32m2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m2_f32m4_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m2_f32m4_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m4_f32m8_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m4_f32m8_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf4_f32mf2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf4_f32mf2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf2_f32m1_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf2_f32m1_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m1_f32m2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m1_f32m2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m2_f32m4_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t 
test_vfwnmacc_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m2_f32m4_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m4_f32m8_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m4_f32m8_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf4_f32mf2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf4_f32mf2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf2_f32m1_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwnmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf2_f32m1_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m1_f32m2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16m1_f32m2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m2_f32m4_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16m2_f32m4_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m4_f32m8_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16m4_f32m8_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf4_f32mf2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_rm_tum( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf4_f32mf2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, + vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf2_f32m1_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, + vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf2_f32m1_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t 
test_vfwnmacc_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16m1_f32m2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m1_f32m2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16m2_f32m4_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m2_f32m4_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16m4_f32m8_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m4_f32m8_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwnmacc_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf4_f32mf2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_rm_tumu( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf4_f32mf2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf2_f32m1_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf2_f32m1_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, + vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16m1_f32m2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, + vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m1_f32m2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16m2_f32m4_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m2_f32m4_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16m4_f32m8_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m4_f32m8_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t 
vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf4_f32mf2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_rm_mu( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf4_f32mf2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vv_bf16mf2_f32m1_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16mf2_f32m1_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m1_f32m2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m1_f32m2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: 
[[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m2_f32m4_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m2_f32m4_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_vv_bf16m4_f32m8_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_vf_bf16m4_f32m8_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwnmsac.c new file mode 100644 index 0000000000000..6133230e5cd84 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwnmsac.c @@ -0,0 +1,1034 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call
@llvm.riscv.vfwnmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf4_f32mf2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf4_f32mf2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf2_f32m1_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf2_f32m1_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m1_f32m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16m1_f32m2_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwnmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m2_f32m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16m2_f32m4_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m4_f32m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16m4_f32m8_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf4_f32mf2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_tum(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf4_f32mf2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf2_f32m1_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf2_f32m1_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m1_f32m2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m1_f32m2_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m2_f32m4_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return 
__riscv_vfwnmsac_vf_bf16m2_f32m4_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m4_f32m8_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m4_f32m8_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf4_f32mf2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf4_f32mf2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf2_f32m1_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf2_f32m1_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m1_f32m2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m1_f32m2_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m2_f32m4_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m2_f32m4_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m4_f32m8_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwnmsac_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m4_f32m8_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf4_f32mf2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf4_f32mf2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf2_f32m1_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf2_f32m1_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t 
test_vfwnmsac_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m1_f32m2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m1_f32m2_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m2_f32m4_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m2_f32m4_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m4_f32m8_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m4_f32m8_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf4_f32mf2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf4_f32mf2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf2_f32m1_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf2_f32m1_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m1_f32m2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16m1_f32m2_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_rm_tu( +// 
CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m2_f32m4_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16m2_f32m4_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m4_f32m8_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16m4_f32m8_rm_tu(vd, vs1, vs2, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf4_f32mf2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_rm_tum( + vbool64_t vm, vfloat32mf2_t vd, __bf16 
vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf4_f32mf2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, + vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf2_f32m1_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, + vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf2_f32m1_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16m1_f32m2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m1_f32m2_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16m2_f32m4_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], 
bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m2_f32m4_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16m4_f32m8_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m4_f32m8_rm_tum(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf4_f32mf2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_rm_tumu( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf4_f32mf2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, 
i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf2_f32m1_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf2_f32m1_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, + vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16m1_f32m2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, + vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m1_f32m2_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16m2_f32m4_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, + vbfloat16m2_t vs2, + size_t vl) { + return 
__riscv_vfwnmsac_vf_bf16m2_f32m4_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16m4_f32m8_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m4_f32m8_rm_tumu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf4_f32mf2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_rm_mu( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf4_f32mf2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vv_bf16mf2_f32m1_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], 
[[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16mf2_f32m1_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m1_f32m2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m1_f32m2_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_vv_bf16m2_f32m4_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_vf_bf16m2_f32m4_rm_mu(vm, vd, vs1, vs2, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t 
test_vfwnmsac_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd,
+                                    vbfloat16m4_t vs1,
+                                    vbfloat16m4_t vs2, size_t vl) {
+  return __riscv_vfwnmsac_vv_bf16m4_f32m8_rm_mu(vm, vd, vs1, vs2,
+                                                __RISCV_FRM_RNE, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_rm_mu(
+// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT:    ret [[TMP0]]
+//
+vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd,
+                                                 __bf16 vs1, vbfloat16m4_t vs2,
+                                                 size_t vl) {
+  return __riscv_vfwnmsac_vf_bf16m4_f32m8_rm_mu(vm, vd, vs1, vs2,
+                                                __RISCV_FRM_RNE, vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwsub.c
new file mode 100644
index 0000000000000..9d9b0b0aa2f3c
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vfwsub.c
@@ -0,0 +1,2007 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret [[TMP0]]
+//
+vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd,
+                                               vbfloat16mf4_t vs2,
+                                               vbfloat16mf4_t vs1, size_t vl) {
+  return __riscv_vfwsub_vv_bf16mf4_f32mf2_tu(vd, vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret [[TMP0]]
+//
+vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd,
+                                               vbfloat16mf4_t vs2, __bf16 rs1,
+                                               size_t vl) {
+  return __riscv_vfwsub_vf_bf16mf4_f32mf2_tu(vd, vs2, rs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT:    ret [[TMP0]]
+//
+vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd,
+                                               vfloat32mf2_t vs2,
+                                               vbfloat16mf4_t vs1, size_t vl) {
+  return __riscv_vfwsub_wv_bf16mf4_f32mf2_tu(vd, vs2, vs1, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+//
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32mf2_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf2_f32m1_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf2_f32m1_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf2_f32m1_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m1_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m1_f32m2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16m1_f32m2_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m1_f32m2_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m2_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m2_f32m4_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16m2_f32m4_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m2_f32m4_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m4_tu(vd, vs2, rs1, vl); +} + 
+// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m4_f32m8_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16m4_f32m8_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m4_f32m8_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m8_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf4_f32mf2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf4_f32mf2_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwsub_wv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf4_f32mf2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32mf2_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf2_f32m1_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf2_f32m1_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf2_f32m1_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + 
vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m1_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m1_f32m2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m1_f32m2_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m1_f32m2_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m2_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m2_f32m4_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, 
i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m2_f32m4_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m2_f32m4_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m4_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m4_f32m8_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m4_f32m8_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m4_f32m8_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m8_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_bf16mf4_f32mf2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf4_f32mf2_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16mf4_f32mf2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32mf2_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf2_f32m1_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwsub_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf2_f32m1_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf2_f32m1_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m1_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m1_f32m2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m1_f32m2_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + 
vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m1_f32m2_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m2_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m2_f32m4_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m2_f32m4_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m2_f32m4_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m4_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], 
i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m4_f32m8_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m4_f32m8_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m4_f32m8_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m8_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf4_f32mf2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf4_f32mf2_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] 
{ +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf4_f32mf2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32mf2_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf2_f32m1_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf2_f32m1_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf2_f32m1_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m1_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwsub_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m1_f32m2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m1_f32m2_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16m1_f32m2_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m2_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m2_f32m4_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + 
size_t vl) { + return __riscv_vfwsub_vf_bf16m2_f32m4_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16m2_f32m4_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m4_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m4_f32m8_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m4_f32m8_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16m4_f32m8_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat32m8_t test_vfwsub_wf_bf16_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m8_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_bf16mf4_f32mf2_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16mf4_f32mf2_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16mf4_f32mf2_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_rm_tu(vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32mf2_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf2_f32m1_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf2_f32m1_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf2_f32m1_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_rm_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m1_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m1_f32m2_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m1_f32m2_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m1_f32m2_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_rm_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m2_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m2_f32m4_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m2_f32m4_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m2_f32m4_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_rm_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m4_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m4_f32m8_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m4_f32m8_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m4_f32m8_rm_tu(vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_rm_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m8_rm_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_bf16mf4_f32mf2_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16mf4_f32mf2_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16mf4_f32mf2_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_rm_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32mf2_rm_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_bf16mf2_f32m1_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf2_f32m1_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16mf2_f32m1_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m1_rm_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t 
vl) { + return __riscv_vfwsub_vv_bf16m1_f32m2_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m1_f32m2_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m1_f32m2_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m2_rm_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m2_f32m4_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m2_f32m4_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m2_f32m4_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m4_rm_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m4_f32m8_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m4_f32m8_rm_tum(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m4_f32m8_rm_tum(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return 
__riscv_vfwsub_wf_bf16_f32m8_rm_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_bf16mf4_f32mf2_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16mf4_f32mf2_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16mf4_f32mf2_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_rm_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32mf2_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_bf16mf2_f32m1_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16mf2_f32m1_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16mf2_f32m1_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m1_rm_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m1_f32m2_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m1_f32m2_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + 
vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m1_f32m2_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m2_rm_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m2_f32m4_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m2_f32m4_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m2_f32m4_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m4_rm_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m4_f32m8_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m4_f32m8_rm_tumu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m4_f32m8_rm_tumu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m8_rm_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_bf16mf4_f32mf2_rm_mu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + 
vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_bf16mf4_f32mf2_rm_mu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_bf16mf4_f32mf2_rm_mu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_rm_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32mf2_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16mf2_f32m1_rm_mu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16mf2_f32m1_rm_mu(vm, vd, vs2, rs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16mf2_f32m1_rm_mu(vm, vd, vs2, vs1, + __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m1_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m1_f32m2_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m1_f32m2_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m1_f32m2_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m2_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, 
size_t vl) { + return __riscv_vfwsub_vv_bf16m2_f32m4_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m2_f32m4_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_bf16m2_f32m4_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_bf16_f32m4_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_bf16m4_f32m8_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_bf16m4_f32m8_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, + vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd,
+ vfloat32m8_t vs2,
+ vbfloat16m4_t vs1, size_t vl) {
+ return __riscv_vfwsub_wv_bf16m4_f32m8_rm_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE,
+ vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_rm_mu(
+// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vfloat32m8_t test_vfwsub_wf_bf16_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd,
+ vfloat32m8_t vs2, __bf16 rs1,
+ size_t vl) {
+ return __riscv_vfwsub_wf_bf16_f32m8_rm_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE,
+ vl);
+}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfeq.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfeq.c
new file mode 100644
index 0000000000000..b96aae5615db8
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfeq.c
@@ -0,0 +1,129 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16mf4_b64_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbool64_t test_vmfeq_vv_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+ return __riscv_vmfeq_vv_bf16mf4_b64_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16mf4_b64_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbool64_t test_vmfeq_vf_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vmfeq_vf_bf16mf4_b64_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16mf2_b32_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbool32_t test_vmfeq_vv_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t 
vl) { + return __riscv_vmfeq_vv_bf16mf2_b32_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfeq_vf_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16mf2_b32_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfeq_vv_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16m1_b16_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfeq_vf_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16m1_b16_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfeq_vv_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfeq_vv_bf16m2_b8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfeq_vf_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_vf_bf16m2_b8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t 
test_vmfeq_vv_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) {
+ return __riscv_vmfeq_vv_bf16m4_b4_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m4_b4_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbool4_t test_vmfeq_vf_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vmfeq_vf_bf16m4_b4_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m8_b2_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbool2_t test_vmfeq_vv_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+ return __riscv_vmfeq_vv_bf16m8_b2_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m8_b2_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbool2_t test_vmfeq_vf_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vmfeq_vf_bf16m8_b2_mu(mask, maskedoff, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfge.c
new file mode 100644
index 0000000000000..47d0427abf99b
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfge.c
@@ -0,0 +1,129 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16mf4_b64_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbool64_t test_vmfge_vv_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+ return __riscv_vmfge_vv_bf16mf4_b64_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16mf4_b64_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef 
[[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfge_vf_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16mf4_b64_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfge_vv_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16mf2_b32_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfge_vf_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16mf2_b32_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfge_vv_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16m1_b16_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfge_vf_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16m1_b16_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfge_vv_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16m2_b8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vmfge_vf_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfge_vf_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16m2_b8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfge_vv_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16m4_b4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfge_vf_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16m4_b4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m8_b2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfge_vv_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfge_vv_bf16m8_b2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m8_b2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfge_vf_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_vf_bf16m8_b2_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfgt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfgt.c new file mode 100644 index 0000000000000..0a0ead22361e9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfgt.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple 
riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16mf4_b64_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfgt_vv_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16mf4_b64_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16mf4_b64_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfgt_vf_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16mf4_b64_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfgt_vv_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16mf2_b32_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfgt_vf_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16mf2_b32_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfgt_vv_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16m1_b16_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfgt_vf_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16m1_b16_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfgt_vv_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16m2_b8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfgt_vf_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16m2_b8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfgt_vv_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16m4_b4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfgt_vf_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16m4_b4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vv_bf16m8_b2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfgt_vv_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfgt_vv_bf16m8_b2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfgt_vf_bf16m8_b2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfgt.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfgt_vf_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfgt_vf_bf16m8_b2_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfle.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfle.c new file mode 100644 index 0000000000000..27ddefec8c9c0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfle.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16mf4_b64_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfle_vv_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16mf4_b64_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16mf4_b64_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfle_vf_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16mf4_b64_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfle_vv_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16mf2_b32_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfle_vf_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, 
vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16mf2_b32_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfle_vv_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16m1_b16_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfle_vf_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16m1_b16_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfle_vv_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16m2_b8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfle_vf_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16m2_b8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfle_vv_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16m4_b4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfle_vf_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16m4_b4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vv_bf16m8_b2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfle_vv_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfle_vv_bf16m8_b2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfle_vf_bf16m8_b2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfle.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfle_vf_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfle_vf_bf16m8_b2_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmflt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmflt.c new file mode 100644 index 0000000000000..d5f4f777580d3 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmflt.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16mf4_b64_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmflt_vv_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16mf4_b64_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16mf4_b64_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmflt_vf_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16mf4_b64_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmflt_vv_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16mf2_b32_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmflt_vf_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16mf2_b32_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmflt_vv_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16m1_b16_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmflt_vf_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16m1_b16_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmflt_vv_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16m2_b8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmflt_vf_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16m2_b8_mu(mask, maskedoff, op1, op2, vl); +} + 
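+// Editor's sketch (not part of the autogenerated checks): the _mu suffix
+// tested throughout this file selects the mask-undisturbed policy, so lanes
+// where `mask` is clear keep their value from `maskedoff` instead of being
+// overwritten by the compare result. A minimal caller might look like the
+// snippet below; the helper name `count_below` is hypothetical, while
+// __riscv_vsetvl_e16m1 and __riscv_vcpop_m_b16 are assumed to be the usual
+// RVV intrinsics for setting VL and counting set mask bits.
+//
+//   size_t count_below(vbool16_t mask, vbool16_t maskedoff,
+//                      vbfloat16m1_t v, __bf16 bound, size_t n) {
+//     size_t vl = __riscv_vsetvl_e16m1(n);   // lanes processed this call
+//     vbool16_t lt = __riscv_vmflt_vf_bf16m1_b16_mu(mask, maskedoff, v,
+//                                                   bound, vl);
+//     return __riscv_vcpop_m_b16(lt, vl);    // number of active hits
+//   }
+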
+// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmflt_vv_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16m4_b4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmflt_vf_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16m4_b4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vv_bf16m8_b2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmflt_vv_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmflt_vv_bf16m8_b2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmflt_vf_bf16m8_b2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmflt.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmflt_vf_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmflt_vf_bf16m8_b2_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfne.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfne.c new file mode 100644 index 0000000000000..c2df9474acc72 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/non-overloaded/vmfne.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16mf4_b64_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 
[[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfne_vv_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16mf4_b64_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16mf4_b64_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfne_vf_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16mf4_b64_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfne_vv_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16mf2_b32_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfne_vf_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16mf2_b32_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfne_vv_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16m1_b16_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfne_vf_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16m1_b16_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfne_vv_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16m2_b8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfne_vf_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16m2_b8_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfne_vv_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16m4_b4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfne_vf_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16m4_b4_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vv_bf16m8_b2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfne_vv_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfne_vv_bf16m8_b2_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfne_vf_bf16m8_b2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfne.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfne_vf_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfne_vf_bf16m8_b2_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfadd.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfadd.c
new file mode 100644
index 0000000000000..2bd3b3995c6c8
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfadd.c
@@ -0,0 +1,489 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfadd_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfadd_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vfadd_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfadd_vf_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfadd.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfadd_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfadd_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfadd_vv_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfadd.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfadd_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+  return __riscv_vfadd_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfadd_vf_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfadd.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfadd_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfadd_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfadd_vv_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfadd.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], i64 7, i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfadd_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) {
+  return __riscv_vfadd_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfadd_vf_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat>
[[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfadd_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfadd_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfadd_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], 
i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfadd_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfadd_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfadd_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfadd_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfadd_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfadd_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfadd_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfadd_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + 
return __riscv_vfadd_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfadd_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfadd_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vbfloat16m8_t test_vfadd_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfadd_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfadd_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfadd_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfadd_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfadd_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfadd.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfadd_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfadd_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfadd_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfadd_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfadd_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfadd_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfadd_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfadd_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfadd_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfadd_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfclass.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfclass.c new file mode 100644 index 0000000000000..e2a993aca7069 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfclass.c @@ -0,0 +1,272 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf4_u16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf4_t test_vfclass_v_bf16mf4_u16mf4_tu(vuint16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfclass_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf2_u16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf2_t test_vfclass_v_bf16mf2_u16mf2_tu(vuint16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfclass_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m1_u16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m1_t test_vfclass_v_bf16m1_u16m1_tu(vuint16m1_t vd, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfclass_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m2_u16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv8bf16.i64( [[VD]], [[VS2]],
i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m2_t test_vfclass_v_bf16m2_u16m2_tu(vuint16m2_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfclass_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m4_u16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m4_t test_vfclass_v_bf16m4_u16m4_tu(vuint16m4_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfclass_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m8_u16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m8_t test_vfclass_v_bf16m8_u16m8_tu(vuint16m8_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfclass_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf4_u16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf4_t test_vfclass_v_bf16mf4_u16mf4_tum(vbool64_t vm, vuint16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfclass_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf2_u16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf2_t test_vfclass_v_bf16mf2_u16mf2_tum(vbool32_t vm, vuint16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfclass_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m1_u16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m1_t test_vfclass_v_bf16m1_u16m1_tum(vbool16_t vm, vuint16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfclass_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m2_u16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m2_t test_vfclass_v_bf16m2_u16m2_tum(vbool8_t vm, vuint16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfclass_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m4_u16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv16bf16.i64( [[VD]], 
[[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m4_t test_vfclass_v_bf16m4_u16m4_tum(vbool4_t vm, vuint16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfclass_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m8_u16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m8_t test_vfclass_v_bf16m8_u16m8_tum(vbool2_t vm, vuint16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfclass_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf4_u16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf4_t test_vfclass_v_bf16mf4_u16mf4_tumu(vbool64_t vm, vuint16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfclass_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf2_u16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf2_t test_vfclass_v_bf16mf2_u16mf2_tumu(vbool32_t vm, vuint16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfclass_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m1_u16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m1_t test_vfclass_v_bf16m1_u16m1_tumu(vbool16_t vm, vuint16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfclass_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m2_u16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m2_t test_vfclass_v_bf16m2_u16m2_tumu(vbool8_t vm, vuint16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfclass_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m4_u16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m4_t test_vfclass_v_bf16m4_u16m4_tumu(vbool4_t vm, vuint16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfclass_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m8_u16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m8_t test_vfclass_v_bf16m8_u16m8_tumu(vbool2_t vm, vuint16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfclass_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf4_u16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf4_t test_vfclass_v_bf16mf4_u16mf4_mu(vbool64_t vm, vuint16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfclass_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16mf2_u16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16mf2_t test_vfclass_v_bf16mf2_u16mf2_mu(vbool32_t vm, vuint16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfclass_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m1_u16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m1_t test_vfclass_v_bf16m1_u16m1_mu(vbool16_t vm, vuint16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfclass_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m2_u16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m2_t test_vfclass_v_bf16m2_u16m2_mu(vbool8_t vm, vuint16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfclass_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m4_u16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m4_t test_vfclass_v_bf16m4_u16m4_mu(vbool4_t vm, vuint16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfclass_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfclass_v_bf16m8_u16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfclass.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m8_t test_vfclass_v_bf16m8_u16m8_mu(vbool2_t vm, vuint16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfclass_mu(vm, vd, vs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmacc.c new file mode 100644 index 0000000000000..eb7427107c942 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmacc.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat
noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vf_bf16m8_tu(vbfloat16m8_t 
vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t 
vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, 
vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vf_bf16m1_tumu(vbool16_t mask, 
vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vf_bf16m8_tumu(vbool2_t mask, 
vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmacc_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmacc_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmacc_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmacc_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmacc_vf_bf16m1_mu(vbool16_t mask, 
vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmacc_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmacc_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmacc_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmacc_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmacc_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmacc_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmacc_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, 
vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmacc_mu(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmadd.c new file mode 100644 index 0000000000000..68d490d04ff86 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmadd.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_tu(vd, vs1, vs2, vl); +} + +//
CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 
[[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: 
ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) 
+// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 
0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmadd_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmadd_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmadd_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmadd_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmadd_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmadd_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmadd_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmadd_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmadd_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmadd_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmadd_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmadd_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmadd_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmadd.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16m8_t test_vfmadd_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) {
+  return __riscv_vfmadd_mu(mask, vd, rs1, vs2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmax.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmax.c
new file mode 100644
index 0000000000000..5f682e80b61b5
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmax.c
@@ -0,0 +1,489 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmax_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmax.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfmax_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vfmax_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmax_vf_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmax.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfmax_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfmax_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfmax_vv_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfmax.nxv2bf16.nxv2bf16.i64(<vscale x 2 x bfloat> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfmax_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+  return __riscv_vfmax_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfmax_vf_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfmax.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfmax_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfmax_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfmax_vv_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfmax.nxv4bf16.nxv4bf16.i64(<vscale x 4 x bfloat> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t
test_vfmax_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmax_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmax_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmax_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmax_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], 
bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmax_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmax_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmax_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m1_tum( +// 
CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmax_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmax_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmax_tum(mask, maskedoff, op1, 
op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmax_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmax_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vv_bf16m1_tumu(vbool16_t mask, 
vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmax_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmax_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmax_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 
[[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmax_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmax_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmax_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmax_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmax_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmax_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vfmax.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmax_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmax_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmax_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmax_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmax_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmax.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmax_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmax_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmax_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfmax.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfmax_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+  return __riscv_vfmax_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfmax_vf_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfmax.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfmax_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vfmax_mu(mask, maskedoff, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmerge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmerge.c
new file mode 100644
index 0000000000000..9593ad5b5a592
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmerge.c
@@ -0,0 +1,69 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmerge_vfm_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmerge.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 1 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfmerge_vfm_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, vbool64_t mask, size_t vl) {
+  return __riscv_vfmerge_tu(maskedoff, op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfmerge_vfm_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfmerge.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 2 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfmerge_vfm_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, vbool32_t mask, size_t vl) {
+  return __riscv_vfmerge_tu(maskedoff, op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfmerge_vfm_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfmerge.nxv4bf16.bf16.i64(<vscale x 4 x bfloat> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 4 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfmerge_vfm_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, vbool16_t mask, size_t vl) {
+  return __riscv_vfmerge_tu(maskedoff, op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfmerge_vfm_bf16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[MASKEDOFF:%.*]], <vscale x 8 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfmerge.nxv8bf16.bf16.i64(<vscale x 8 x bfloat> [[MASKEDOFF]], <vscale x 8 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+vbfloat16m2_t test_vfmerge_vfm_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, vbool8_t mask, size_t vl) {
+  return __riscv_vfmerge_tu(maskedoff, op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfmerge_vfm_bf16m4_tu(
+// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfmerge.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[MASKEDOFF]], <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfmerge_vfm_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, vbool4_t mask, size_t vl) {
+  return __riscv_vfmerge_tu(maskedoff, op1, op2, mask, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfmerge_vfm_bf16m8_tu(
+// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfmerge.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfmerge_vfm_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, vbool2_t mask, size_t vl) {
+  return __riscv_vfmerge_tu(maskedoff, op1, op2, mask, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmin.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmin.c
new file mode 100644
index 0000000000000..f3ef3c3614027
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmin.c
@@ -0,0 +1,489 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmin_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmin.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfmin_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vfmin_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfmin_vf_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfmin.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+//
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmin_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmin_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmin_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m4_tu( 
+// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmin_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmin_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmin_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmin_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmin_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmin_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmin_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmin_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmin_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tumu(mask, maskedoff, op1, op2, vl); +} 
+ +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmin_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmin_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmin_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t 
maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmin_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmin_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmin_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmin_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmin_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmin_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmin_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmin_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmin_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmin_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmin_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmin_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmin_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfmin.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m2_t test_vfmin_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfmin_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m4_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m4_t test_vfmin_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) {
+ return __riscv_vfmin_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m4_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m4_t test_vfmin_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfmin_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmin_vv_bf16m8_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vfmin_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+ return __riscv_vfmin_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmin_vf_bf16m8_mu(
+// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmin.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m8_t test_vfmin_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfmin_mu(mask, maskedoff, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmsac.c
new file mode 100644
index 0000000000000..0587c57af2b06
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmsac.c
@@ -0,0 +1,489 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local
@test_vfmsac_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf4_tum( +// 
CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m2_tum( +// 
CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf4_tumu( +// 
CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfmsac_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define 
dso_local @test_vfmsac_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsac_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsac_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsac_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsac_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsac_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsac_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfmsac_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsac_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsac_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsac_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsac_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsac_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsac_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsac_mu(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmsub.c new file mode 100644 index 0000000000000..2ad26f8b24e12 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmsub.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: 
-emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmsub_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+ return __riscv_vfmsub_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf4_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf4_t test_vfmsub_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) {
+ return __riscv_vfmsub_tu(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf2_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf2_t test_vfmsub_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) {
+ return __riscv_vfmsub_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf2_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16mf2_t test_vfmsub_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) {
+ return __riscv_vfmsub_tu(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m1_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m1_t test_vfmsub_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) {
+ return __riscv_vfmsub_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m1_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret [[TMP0]]
+//
+vbfloat16m1_t test_vfmsub_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) {
+ return __riscv_vfmsub_tu(vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m2_tu(
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+//
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + 
return __riscv_vfmsub_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + 
return __riscv_vfmsub_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return 
__riscv_vfmsub_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t 
vl) { + return __riscv_vfmsub_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, 
size_t vl) { + return __riscv_vfmsub_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmsub_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfmsub_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmsub_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfmsub_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmsub_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfmsub_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + 
return __riscv_vfmsub_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmsub_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfmsub_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmsub_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfmsub_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmsub_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmsub_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfmsub_mu(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmul.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmul.c new file mode 100644 index 0000000000000..d1e726a9d63ea --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmul.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature 
+v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmul_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmul_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmul_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tu(maskedoff, op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmul_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmul_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmul_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmul_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmul_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmul_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m2_tum( +// 
CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmul_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmul_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmul_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + 
return __riscv_vfmul_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmul_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmul_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmul_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) 
+// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmul_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmul_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmul_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfmul_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmul_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfmul_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmul_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfmul_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmul_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfmul_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmul_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfmul_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmul_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfmul_mu(mask, maskedoff, op1, op2, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmul_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmul.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmul_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfmul_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmv.c new file mode 100644 index 0000000000000..9fd1ffce446bb --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfmv.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_v_f_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv1bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmv_v_f_bf16mf4_tu(vbfloat16mf4_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_v_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_v_f_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv2bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmv_v_f_bf16mf2_tu(vbfloat16mf2_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_v_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_v_f_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv4bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmv_v_f_bf16m1_tu(vbfloat16m1_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_v_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_v_f_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv8bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmv_v_f_bf16m2_tu(vbfloat16m2_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_v_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_v_f_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv16bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmv_v_f_bf16m4_tu(vbfloat16m4_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_v_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_v_f_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.v.f.nxv32bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmv_v_f_bf16m8_tu(vbfloat16m8_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_v_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv1bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfmv_s_f_bf16mf4_tu(vbfloat16mf4_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_s_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv2bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfmv_s_f_bf16mf2_tu(vbfloat16mf2_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_s_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv4bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfmv_s_f_bf16m1_tu(vbfloat16m1_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_s_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv8bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfmv_s_f_bf16m2_tu(vbfloat16m2_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_s_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv16bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfmv_s_f_bf16m4_tu(vbfloat16m4_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_s_tu(maskedoff, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfmv_s_f_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], bfloat noundef [[SRC:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfmv.s.f.nxv32bf16.i64( [[MASKEDOFF]], bfloat [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfmv_s_f_bf16m8_tu(vbfloat16m8_t maskedoff, __bf16 src, size_t vl) { + return __riscv_vfmv_s_tu(maskedoff, src, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfncvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfncvt.c new file mode 100644 index 0000000000000..c6cd0a55fa530 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfncvt.c @@ -0,0 +1,1539 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_tu(vint8mf8_t vd, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_tu(vint8mf4_t vd, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_tu(vint8mf2_t vd, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_tu(vint8m1_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_tu(vint8m2_t vd, vbfloat16m4_t vs2, + size_t vl) { + 
return __riscv_vfncvt_x_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_tu(vint8m4_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_tu(vuint8mf8_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_tu(vuint8mf4_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_tu(vuint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_tu(vuint8m1_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_tu(vuint8m2_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_tu(vuint8m4_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tu(vd, vs2, vl); +} 
+ +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_tu(vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_tu(vbfloat16mf2_t vd, vfloat32m1_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_tu(vbfloat16m1_t vd, vfloat32m2_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_tu(vbfloat16m2_t vd, vfloat32m4_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_tu(vbfloat16m4_t vd, vfloat32m8_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_tum(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_x_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_tum(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return 
__riscv_vfncvt_x_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_tum(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_tum(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_tum(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_tum(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_tum(vbool64_t vm, vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_tum(vbool32_t vm, vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_tum(vbool16_t vm, vuint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_tum(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_tum(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_tum(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t 
test_vfncvt_f_f_w_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_tumu(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_x_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_tumu(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_x_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_tumu(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_tumu(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_tumu( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_tumu(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_tumu(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_tumu(vbool64_t vm, vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_tumu(vbool32_t vm, vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_tumu(vbool16_t vm, vuint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_tumu(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_tumu(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_tumu(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + 
vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_mu(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_x_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_mu(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_x_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_mu(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_mu(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_mu(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_mu(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_mu(vbool64_t vm, vuint8mf8_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_mu(vbool32_t vm, vuint8mf4_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_mu(vbool16_t vm, vuint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_xu_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_mu(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_mu(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_mu(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t 
test_vfncvt_f_f_w_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tu(vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tu(vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_rm_tu(vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_rm_tu(vint8m1_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_rm_tu(vint8m2_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_rm_tu(vint8m4_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tu(vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tu(vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tu(vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfncvt_xu_f_w_bf16m2_u8m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_rm_tu(vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_rm_tu(vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_rm_tu(vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_rm_tu(vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_rm_tu(vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_rm_tu(vbfloat16m1_t vd, vfloat32m2_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_rm_tu(vbfloat16m2_t vd, vfloat32m4_t vs2, + size_t vl) { + return 
__riscv_vfncvt_f_bf16_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_rm_tu(vbfloat16m4_t vd, vfloat32m8_t vs2, + size_t vl) { + return __riscv_vfncvt_f_bf16_tu(vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tum(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tum(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_rm_tum(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_rm_tum(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_rm_tum(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_rm_tum( +// CHECK-RV64-SAME: 
[[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_rm_tum(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tum(vbool64_t vm, + vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tum(vbool32_t vm, + vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tum(vbool16_t vm, vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_rm_tum(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_rm_tum(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_rm_tum(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_rm_tum(vbool64_t vm, vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_rm_tum(vbool32_t vm, vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_rm_tum(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_rm_tum(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_rm_tum(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tum(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_tumu(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_tumu(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_rm_tumu(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_x_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_rm_tumu(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_rm_tumu(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_rm_tumu(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], 
i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_tumu(vbool64_t vm, + vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_tumu(vbool32_t vm, + vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_tumu(vbool16_t vm, + vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_rm_tumu(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_rm_tumu(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_rm_tumu(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_rm_tumu(vbool64_t vm, + vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_rm_tumu(vbool32_t vm, + vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_rm_tumu(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_rm_tumu(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_rm_tumu(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_tumu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_x_f_w_bf16mf4_i8mf8_rm_mu(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_x_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_x_f_w_bf16mf2_i8mf4_rm_mu(vbool32_t vm, vint8mf4_t vd, + 
vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_x_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m1_i8mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_x_f_w_bf16m1_i8mf2_rm_mu(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_x_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m2_i8m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_x_f_w_bf16m2_i8m1_rm_mu(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_x_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m4_i8m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_x_f_w_bf16m4_i8m2_rm_mu(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_x_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_x_f_w_bf16m8_i8m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_x_f_w_bf16m8_i8m4_rm_mu(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_x_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_xu_f_w_bf16mf4_u8mf8_rm_mu(vbool64_t vm, vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_xu_f_w_bf16mf2_u8mf4_rm_mu(vbool32_t vm, vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_xu_f_w_bf16m1_u8mf2_rm_mu(vbool16_t vm, vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_xu_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m2_u8m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_xu_f_w_bf16m2_u8m1_rm_mu(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_xu_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m4_u8m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_xu_f_w_bf16m4_u8m2_rm_mu(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_xu_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_xu_f_w_bf16m8_u8m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_xu_f_w_bf16m8_u8m4_rm_mu(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_xu_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_f_f_w_bf16mf4_rm_mu(vbool64_t vm, vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_f_f_w_bf16mf2_rm_mu(vbool32_t vm, vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_f_f_w_bf16m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], 
<vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfncvt.f.f.w.mask.nxv4bf16.nxv4f32.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_vfncvt_f_f_w_bf16m1_rm_mu(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfncvt_f_f_w_bf16m2_rm_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfncvt.f.f.w.mask.nxv8bf16.nxv8f32.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_vfncvt_f_f_w_bf16m2_rm_mu(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfncvt_f_f_w_bf16m4_rm_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfncvt.f.f.w.mask.nxv16bf16.nxv16f32.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_vfncvt_f_f_w_bf16m4_rm_mu(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_f_bf16_mu(vm, vd, vs2, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfncvt_rod.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfncvt_rod.c new file mode 100644 index 0000000000000..0745633042d44 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfncvt_rod.c @@ -0,0 +1,233 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfncvt_rod_f_f_w_bf16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv1bf16.nxv1f32.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_rod_f_f_w_bf16mf4_tu(vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfncvt_rod_f_f_w_bf16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.nxv2bf16.nxv2f32.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_rod_f_f_w_bf16mf2_tu(vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfncvt_rod_f_f_w_bf16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +//
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_rod_f_f_w_bf16m1_tu(vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_rod_f_f_w_bf16m2_tu(vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_rod_f_f_w_bf16m4_tu(vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_rod_f_f_w_bf16mf4_tum(vbool64_t vm, + vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_rod_f_f_w_bf16mf2_tum(vbool32_t vm, + vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_rod_f_f_w_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_rod_f_f_w_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return 
__riscv_vfncvt_rod_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_rod_f_f_w_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv1bf16.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_rod_f_f_w_bf16mf4_tumu(vbool64_t vm, + vbfloat16mf4_t vd, + vfloat32mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv2bf16.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_rod_f_f_w_bf16mf2_tumu(vbool32_t vm, + vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv4bf16.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfncvt_rod_f_f_w_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv8bf16.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfncvt_rod_f_f_w_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv16bf16.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfncvt_rod_f_f_w_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rod_f_f_w_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], 
<vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv1bf16.nxv1f32.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_vfncvt_rod_f_f_w_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfncvt_rod_f_f_w_bf16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv2bf16.nxv2f32.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_vfncvt_rod_f_f_w_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfncvt_rod_f_f_w_bf16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv4bf16.nxv4f32.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_vfncvt_rod_f_f_w_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfncvt_rod_f_f_w_bf16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv8bf16.nxv8f32.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_vfncvt_rod_f_f_w_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfncvt_rod_f_f_w_bf16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfncvt.rod.f.f.w.mask.nxv16bf16.nxv16f32.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_vfncvt_rod_f_f_w_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_vfncvt_rod_f_bf16_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfncvt_rtz.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfncvt_rtz.c new file mode 100644 index 0000000000000..b906c5f411064 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfncvt_rtz.c @@ -0,0 +1,572 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i8> @test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tu( +// CHECK-RV64-SAME: <vscale x 1 x i8> [[VD:%.*]],
[[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tu(vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tu(vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tu(vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m2_i8m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_rtz_x_f_w_bf16m2_i8m1_tu(vint8m1_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m4_i8m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_rtz_x_f_w_bf16m4_i8m2_tu(vint8m2_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m8_i8m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_rtz_x_f_w_bf16m8_i8m4_tu(vint8m4_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tu(vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tu( +// CHECK-RV64-SAME: 
[[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tu(vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tu(vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tu(vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tu(vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tu(vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tum(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tum(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return 
__riscv_vfncvt_rtz_x_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tum(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m2_i8m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_rtz_x_f_w_bf16m2_i8m1_tum(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m4_i8m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_rtz_x_f_w_bf16m4_i8m2_tum(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m8_i8m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_rtz_x_f_w_bf16m8_i8m4_tum(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tum(vbool64_t vm, + vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tum(vbool32_t vm, + vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], 
[[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tum(vbool16_t vm, + vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tum(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tum(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tum(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_tumu(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_tumu(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_tumu(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m2_i8m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_rtz_x_f_w_bf16m2_i8m1_tumu(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m4_i8m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_rtz_x_f_w_bf16m4_i8m2_tumu(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m8_i8m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_rtz_x_f_w_bf16m8_i8m4_tumu(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_tumu(vbool64_t vm, + vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_tumu(vbool32_t vm, + vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t 
test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_tumu(vbool16_t vm, + vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m1_t test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_tumu(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m2_t test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_tumu(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m4_t test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_tumu(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf8_t test_vfncvt_rtz_x_f_w_bf16mf4_i8mf8_mu(vbool64_t vm, vint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf4_t test_vfncvt_rtz_x_f_w_bf16mf2_i8mf4_mu(vbool32_t vm, vint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_x_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8mf2_t test_vfncvt_rtz_x_f_w_bf16m1_i8mf2_mu(vbool16_t vm, vint8mf2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_mu(vm, vd, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m2_i8m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv8i8.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m1_t test_vfncvt_rtz_x_f_w_bf16m2_i8m1_mu(vbool8_t vm, vint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m4_i8m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv16i8.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m2_t test_vfncvt_rtz_x_f_w_bf16m4_i8m2_mu(vbool4_t vm, vint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_x_f_w_bf16m8_i8m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.x.f.w.mask.nxv32i8.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m4_t test_vfncvt_rtz_x_f_w_bf16m8_i8m4_mu(vbool2_t vm, vint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_x_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv1i8.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf8_t test_vfncvt_rtz_xu_f_w_bf16mf4_u8mf8_mu(vbool64_t vm, + vuint8mf8_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv2i8.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf4_t test_vfncvt_rtz_xu_f_w_bf16mf2_u8mf4_mu(vbool32_t vm, + vuint8mf4_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv4i8.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8mf2_t test_vfncvt_rtz_xu_f_w_bf16m1_u8mf2_mu(vbool16_t vm, vuint8mf2_t vd, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfncvt_rtz_xu_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv8i8.nxv8bf16.i64(<vscale x 8 x i8> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x i8> [[TMP0]] +// +vuint8m1_t test_vfncvt_rtz_xu_f_w_bf16m2_u8m1_mu(vbool8_t vm, vuint8m1_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i8> @test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x i8> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv16i8.nxv16bf16.i64(<vscale x 16 x i8> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x i8> [[TMP0]] +// +vuint8m2_t test_vfncvt_rtz_xu_f_w_bf16m4_u8m2_mu(vbool4_t vm, vuint8m2_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i8> @test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x i8> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vfncvt.rtz.xu.f.w.mask.nxv32i8.nxv32bf16.i64(<vscale x 32 x i8> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x i8> [[TMP0]] +// +vuint8m4_t test_vfncvt_rtz_xu_f_w_bf16m8_u8m4_mu(vbool2_t vm, vuint8m4_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfncvt_rtz_xu_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfnmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfnmacc.c new file mode 100644 index 0000000000000..cc487b49429dd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfnmacc.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmacc_vv_bf16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmacc_vf_bf16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmacc.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfnmacc_vv_bf16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS1:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT:
[[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_tu(vd, vs1, vs2, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat 
noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf2_tumu( 
+// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfnmacc_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmacc_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmacc_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmacc_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmacc_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_mu(mask, vd, vs1, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmacc_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmacc_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmacc_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmacc_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmacc_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmacc_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmacc_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmacc.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmacc_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmacc_mu(mask, vd, vs1, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfnmacc_vf_bf16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfnmacc.mask.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfnmacc_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) {
+  return __riscv_vfnmacc_mu(mask, vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmacc_vv_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmacc.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmacc_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) {
+  return __riscv_vfnmacc_mu(mask, vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmacc_vf_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmacc.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmacc_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) {
+  return __riscv_vfnmacc_mu(mask, vd, rs1, vs2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfnmadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfnmadd.c
new file mode 100644
index 0000000000000..f9c348b3dbb0b
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfnmadd.c
@@ -0,0 +1,489 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmadd_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmadd_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfnmadd_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmadd_vf_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmadd.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmadd_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1,
vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16mf2_t test_vfnmadd_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: 
ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], 
i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], 
[[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmadd_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmadd_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmadd_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmadd_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmadd_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmadd_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfnmadd.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmadd_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmadd_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmadd_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmadd_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmadd.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmadd_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmadd_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmadd_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfnmadd.mask.nxv16bf16.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfnmadd_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) {
+  return __riscv_vfnmadd_mu(mask, vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfnmadd_vf_bf16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfnmadd.mask.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfnmadd_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) {
+  return __riscv_vfnmadd_mu(mask, vd, rs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmadd_vv_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmadd.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmadd_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) {
+  return __riscv_vfnmadd_mu(mask, vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfnmadd_vf_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfnmadd.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[VD]], bfloat [[RS1]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[MASK]], i64 7, i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfnmadd_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) {
+  return __riscv_vfnmadd_mu(mask, vd, rs1, vs2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfnmsac.c
new file mode 100644
index 0000000000000..83d35e81403ce
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfnmsac.c
@@ -0,0 +1,489 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsac_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS1:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfnmsac.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS1]], <vscale x 1 x bfloat> [[VS2]], i64 7, i64 [[VL]], i64 2)
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfnmsac_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) {
+  return __riscv_vfnmsac_tu(vd, vs1, vs2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfnmsac_vf_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], bfloat noundef [[RS1:%.*]],
[[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t 
test_vfnmsac_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfnmsac_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local @test_vfnmsac_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_tumu(mask, vd, rs1, vs2, vl); 
+} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return 
__riscv_vfnmsac_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsac_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsac_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t 
vs2, size_t vl) { + return __riscv_vfnmsac_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsac_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsac_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsac_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsac_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsac_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsac_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t 
vs2, size_t vl) { + return __riscv_vfnmsac_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsac_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsac_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsac_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsac.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsac_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsac_mu(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfnmsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfnmsub.c new file mode 100644 index 0000000000000..f5282a195131d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfnmsub.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +//
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vv_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vf_bf16mf4_tu(vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vv_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vf_bf16mf2_tu(vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vv_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vf_bf16m1_tu(vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vv_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], 
bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vf_bf16m2_tu(vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vv_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vf_bf16m4_tu(vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vv_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vf_bf16m8_tu(vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_tu(vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], 
[[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.bf16.i64( [[VD]], 
bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_tum(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_tum(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfnmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_tumu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_tumu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, vbfloat16mf4_t vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv1bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfnmsub_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t vd, __bf16 rs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfnmsub_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, vbfloat16mf2_t vs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv2bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfnmsub_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t vd, __bf16 rs1, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfnmsub_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, vbfloat16m1_t vs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv4bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfnmsub_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t vd, __bf16 rs1, vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfnmsub_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, vbfloat16m2_t vs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv8bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfnmsub_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t vd, __bf16 rs1, vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfnmsub_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, vbfloat16m4_t vs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv16bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfnmsub_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t vd, __bf16 rs1, vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfnmsub_mu(mask, vd, rs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.nxv32bf16.i64( [[VD]], [[VS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, vbfloat16m8_t vs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_mu(mask, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfnmsub_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[VD:%.*]], bfloat noundef [[RS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfnmsub.mask.nxv32bf16.bf16.i64( [[VD]], bfloat [[RS1]], [[VS2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfnmsub_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t vd, __bf16 rs1, vbfloat16m8_t vs2, size_t vl) { + return __riscv_vfnmsub_mu(mask, vd, rs1, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfrec7.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfrec7.c new file mode 100644 index 0000000000000..f8e5a339c87d7 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfrec7.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define
dso_local @test_vfrec7_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrec7_v_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrec7_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrec7_v_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrec7_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrec7_v_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrec7_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrec7_v_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrec7_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrec7_v_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrec7_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrec7_v_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrec7_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrec7_v_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrec7_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfrec7_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrec7_v_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrec7_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrec7_v_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrec7_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrec7_v_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrec7_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrec7_v_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrec7_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrec7_v_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrec7_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrec7_v_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrec7_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfrec7.mask.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrec7_v_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrec7_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrec7_v_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrec7_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrec7_v_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrec7_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrec7_v_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrec7_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrec7_v_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrec7_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrec7_v_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrec7_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrec7_v_bf16mf2_mu(vbool32_t mask, 
vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrec7_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrec7_v_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrec7_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrec7_v_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrec7_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrec7_v_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrec7_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrec7_v_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrec7.mask.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrec7_v_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrec7_mu(mask, maskedoff, op1, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfrsqrt7.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfrsqrt7.c new file mode 100644 index 0000000000000..7c6c926e50c10 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfrsqrt7.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsqrt7_v_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return
__riscv_vfrsqrt7_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsqrt7_v_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrsqrt7_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsqrt7_v_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrsqrt7_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsqrt7_v_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrsqrt7_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsqrt7_v_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrsqrt7_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsqrt7_v_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrsqrt7_tu(maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsqrt7_v_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrsqrt7_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsqrt7_v_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, 
vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrsqrt7_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsqrt7_v_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrsqrt7_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsqrt7_v_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrsqrt7_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsqrt7_v_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrsqrt7_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsqrt7_v_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrsqrt7_tum(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsqrt7_v_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrsqrt7_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsqrt7_v_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrsqrt7_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsqrt7_v_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrsqrt7_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsqrt7_v_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrsqrt7_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsqrt7_v_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrsqrt7_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsqrt7_v_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrsqrt7_tumu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsqrt7_v_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, size_t vl) { + return __riscv_vfrsqrt7_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsqrt7_v_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, size_t vl) { + return __riscv_vfrsqrt7_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv4bf16.i64( [[MASKEDOFF]], 
[[OP1]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsqrt7_v_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, size_t vl) { + return __riscv_vfrsqrt7_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsqrt7_v_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, size_t vl) { + return __riscv_vfrsqrt7_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsqrt7_v_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, size_t vl) { + return __riscv_vfrsqrt7_mu(mask, maskedoff, op1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsqrt7_v_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsqrt7.mask.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsqrt7_v_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, size_t vl) { + return __riscv_vfrsqrt7_mu(mask, maskedoff, op1, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfrsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfrsub.c new file mode 100644 index 0000000000000..c09caeb8207af --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfrsub.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsub_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv2bf16.bf16.i64( [[MASKEDOFF]],
[[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsub_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsub_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsub_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsub_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsub_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsub_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vbfloat16mf2_t test_vfrsub_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsub_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsub_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsub_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsub_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsub_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsub_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsub_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsub_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsub_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsub_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfrsub_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_mu(mask, maskedoff, op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfrsub_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfrsub_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfrsub_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfrsub_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfrsub_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfrsub.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfrsub_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfrsub_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfsgnj.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfsgnj.c new file mode 100644 index 0000000000000..c1f69932f6a02 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfsgnj.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnj_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnj_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnj_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16
op2, size_t vl) { + return __riscv_vfsgnj_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnj_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnj_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnj_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnj_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnj_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnj_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfsgnj_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnj_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnj_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnj_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t 
vl) { + return __riscv_vfsgnj_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnj_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnj_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnj_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnj_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnj_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnj_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnj_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnj_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnj_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnj_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnj_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], 
bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnj_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnj_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnj_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnj_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnj_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnj_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnj_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfsgnj_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnj.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnj_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnj_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfsgnjn.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfsgnjn.c new file mode 100644 index 0000000000000..1b799d87d8131 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfsgnjn.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjn_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjn_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return
__riscv_vfsgnjn_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjn_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjn_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjn_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjn_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjn_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjn_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], 
[[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjn_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjn_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjn_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tum(mask, maskedoff, op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjn_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjn_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjn_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t 
test_vfsgnjn_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjn_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjn_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjn_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfsgnjn.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjn_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjn_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjn_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjn_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjn_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], 
[[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjn_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjn_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjn_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjn_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjn_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjn_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjn_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjn.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjn_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjn_mu(mask, maskedoff, op1, op2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfsgnjn_vf_bf16m4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x bfloat> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[MASKEDOFF]], <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfsgnjn_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfsgnjn_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfsgnjn_vv_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], <vscale x 32 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x bfloat> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfsgnjn_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+ return __riscv_vfsgnjn_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfsgnjn_vf_bf16m8_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x bfloat> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.vfsgnjn.mask.nxv32bf16.bf16.i64(<vscale x 32 x bfloat> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]], i64 1)
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
+//
+vbfloat16m8_t test_vfsgnjn_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+ return __riscv_vfsgnjn_mu(mask, maskedoff, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfsgnjx.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfsgnjx.c
new file mode 100644
index 0000000000000..9c5f2af3d6e8f
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfsgnjx.c
@@ -0,0 +1,489 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfsgnjx_vv_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnjx.nxv1bf16.nxv1bf16.i64(<vscale x 1 x bfloat> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfsgnjx_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+ return __riscv_vfsgnjx_tu(maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfsgnjx_vf_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfsgnjx.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfsgnjx_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjx_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjx_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjx_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfsgnjx_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjx_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjx_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjx_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], 
[[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjx_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjx_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjx_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tum(mask, maskedoff, 
op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjx_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjx_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjx_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t 
test_vfsgnjx_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjx_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjx_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjx_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfsgnjx.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjx_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjx_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsgnjx_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], 
[[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsgnjx_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsgnjx_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsgnjx_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsgnjx_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsgnjx_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsgnjx_mu(mask, maskedoff, op1, op2, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsgnjx_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsgnjx_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsgnjx_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsgnjx_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsgnjx_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsgnjx.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsgnjx_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsgnjx_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfslide1down.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfslide1down.c new file mode 100644 index 0000000000000..691302e245427 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfslide1down.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: 
--version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN: FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_vfslide1down_vf_bf16mf4_tu(
+// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.vfslide1down.nxv1bf16.bf16.i64(<vscale x 1 x bfloat> [[MASKEDOFF]], <vscale x 1 x bfloat> [[SRC]], bfloat [[VALUE]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]]
+//
+vbfloat16mf4_t test_vfslide1down_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) {
+ return __riscv_vfslide1down_tu(maskedoff, src, value, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_vfslide1down_vf_bf16mf2_tu(
+// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.vfslide1down.nxv2bf16.bf16.i64(<vscale x 2 x bfloat> [[MASKEDOFF]], <vscale x 2 x bfloat> [[SRC]], bfloat [[VALUE]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]]
+//
+vbfloat16mf2_t test_vfslide1down_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) {
+ return __riscv_vfslide1down_tu(maskedoff, src, value, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_vfslide1down_vf_bf16m1_tu(
+// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.vfslide1down.nxv4bf16.bf16.i64(<vscale x 4 x bfloat> [[MASKEDOFF]], <vscale x 4 x bfloat> [[SRC]], bfloat [[VALUE]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]]
+//
+vbfloat16m1_t test_vfslide1down_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) {
+ return __riscv_vfslide1down_tu(maskedoff, src, value, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_vfslide1down_vf_bf16m2_tu(
+// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[MASKEDOFF:%.*]], <vscale x 8 x bfloat> [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.vfslide1down.nxv8bf16.bf16.i64(<vscale x 8 x bfloat> [[MASKEDOFF]], <vscale x 8 x bfloat> [[SRC]], bfloat [[VALUE]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+vbfloat16m2_t test_vfslide1down_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) {
+ return __riscv_vfslide1down_tu(maskedoff, src, value, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_vfslide1down_vf_bf16m4_tu(
+// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.vfslide1down.nxv16bf16.bf16.i64(<vscale x 16 x bfloat> [[MASKEDOFF]], <vscale x 16 x bfloat> [[SRC]], bfloat [[VALUE]], i64 [[VL]])
+// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
+//
+vbfloat16m4_t test_vfslide1down_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) {
+ return __riscv_vfslide1down_tu(maskedoff, src, value, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_vfslide1down_vf_bf16m8_tu(
+// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT: [[ENTRY:.*:]]
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call
@llvm.riscv.vfslide1down.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1down_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1down_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1down_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1down_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1down_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1down_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tum(mask, maskedoff, src, 
value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1down_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1down_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1down_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1down_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1down_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] 
= call @llvm.riscv.vfslide1down.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1down_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1down_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1down_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1down_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1down_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1down_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return 
__riscv_vfslide1down_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1down_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1down_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1down.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1down_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1down_mu(mask, maskedoff, src, value, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfslide1up.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfslide1up.c new file mode 100644 index 0000000000000..1238d2204c6d4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfslide1up.c @@ -0,0 +1,249 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1up_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1up_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] 
= call @llvm.riscv.vfslide1up.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1up_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1up_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1up_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1up_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tu(maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1up_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1up_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1up_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1up_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1up_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1up_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tum(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1up_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1up_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, 
__bf16 value, size_t vl) { + return __riscv_vfslide1up_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1up_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1up_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1up_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1up_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_tumu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfslide1up_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfslide1up_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfslide1up_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfslide1up_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfslide1up_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_mu(mask, maskedoff, src, value, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfslide1up_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[SRC:%.*]], bfloat noundef [[VALUE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfslide1up.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[SRC]], bfloat [[VALUE]], [[MASK]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfslide1up_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t src, __bf16 value, size_t vl) { + return __riscv_vfslide1up_mu(mask, maskedoff, src, value, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfsub.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfsub.c new file mode 100644 index 0000000000000..ea4f8f043d6bb --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfsub.c @@ -0,0 +1,489 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature 
+experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vv_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vf_bf16mf4_tu(vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vv_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vf_bf16mf2_tu(vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vv_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m1_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vf_bf16m1_tu(vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfsub_vv_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vv_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m2_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vf_bf16m2_tu(vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vv_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m4_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vf_bf16m4_tu(vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vv_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m8_tu( +// CHECK-RV64-SAME: [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vf_bf16m8_tu(vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tu(maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfsub.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vv_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vf_bf16mf4_tum(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vv_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vf_bf16mf2_tum(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vv_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m1_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vf_bf16m1_tum(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], 
[[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vv_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m2_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vf_bf16m2_tum(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vv_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m4_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vf_bf16m4_tum(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vv_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsub_tum(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m8_tum( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vf_bf16m8_tum(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tum(mask, 
maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vv_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vf_bf16mf4_tumu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vv_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vf_bf16mf2_tumu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vv_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m1_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vbfloat16m1_t test_vfsub_vf_bf16m1_tumu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vv_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m2_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vf_bf16m2_tumu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vv_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m4_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vf_bf16m4_tumu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vv_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m8_tumu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call @llvm.riscv.vfsub.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vf_bf16m8_tumu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_tumu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vv_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vfsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfsub_vf_bf16mf4_mu(vbool64_t mask, vbfloat16mf4_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vv_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vfsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16mf2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfsub_vf_bf16mf2_mu(vbool32_t mask, vbfloat16mf2_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vv_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vfsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m1_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], 
bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfsub_vf_bf16m1_mu(vbool16_t mask, vbfloat16m1_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vv_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vfsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfsub_vf_bf16m2_mu(vbool8_t mask, vbfloat16m2_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vv_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vfsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vf_bf16m4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfsub_vf_bf16m4_mu(vbool4_t mask, vbfloat16m4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfsub_vv_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vv_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vfsub_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local @test_vfsub_vf_bf16m8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfsub.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfsub_vf_bf16m8_mu(vbool2_t mask, vbfloat16m8_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vfsub_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwadd.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwadd.c new file mode 100644 index 0000000000000..e5b7b8da1f3cc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwadd.c @@ -0,0 +1,1932 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, + __bf16 rs1, size_t vl) { + return 
__riscv_vfwadd_wf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) 
{ + return __riscv_vfwadd_vv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local @test_vfwadd_wf_bf16_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return 
__riscv_vfwadd_vv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + 
vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t 
test_vfwadd_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( 
[[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_mu( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_mu(vm, vd, vs2, rs1, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_mu(vm, vd, 
vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_tu(vd, 
vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_rm_tu(vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_rm_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_rm_tu( 
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_rm_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_rm_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_rm_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_wf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( 
[[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_rm_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwadd_wv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t 
test_vfwadd_wf_bf16_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + 
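+// Note on the trailing scalar operands in these checks: the first trailing
+// i64 is the frm (rounding mode) operand, where 7 means "use the dynamic frm
+// CSR" and the _rm_ variants instead pass an explicit mode such as
+// __RISCV_FRM_RNE (0). For the masked intrinsics, the final i64 is the
+// tail/mask policy operand: 2 = tum (tail undisturbed, mask agnostic),
+// 0 = tumu (tail and mask undisturbed), 1 = mu (tail agnostic, mask
+// undisturbed).
+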
+// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_rm_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: 
ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_tumu(vm, vd, vs2, 
rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_vv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_vf_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwadd_vf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwadd_wv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwadd_wf_bf16_f32mf2_rm_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, 
i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwadd_wf_bf16_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_vv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwadd_wv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwadd_wf_bf16_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_vv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwadd_wv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwadd_wf_bf16_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_vv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local @test_vfwadd_vf_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_vf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwadd_wv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwadd_wf_bf16_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwadd.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwadd_wf_bf16_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwadd_wf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwcvt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwcvt.c new file mode 100644 index 0000000000000..730010421b944 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwcvt.c @@ -0,0 +1,765 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_x_v_bf16mf4_tu(vbfloat16mf4_t vd, vint8mf8_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_x_v_bf16mf2_tu(vbfloat16mf2_t vd, vint8mf4_t vs2, + size_t vl) { + 
return __riscv_vfwcvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_x_v_bf16m1_tu(vbfloat16m1_t vd, vint8mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_x_v_bf16m2_tu(vbfloat16m2_t vd, vint8m1_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_x_v_bf16m4_tu(vbfloat16m4_t vd, vint8m2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_x_v_bf16m8_tu(vbfloat16m8_t vd, vint8m4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_xu_v_bf16mf4_tu(vbfloat16mf4_t vd, vuint8mf8_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_xu_v_bf16mf2_tu(vbfloat16mf2_t vd, vuint8mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_xu_v_bf16m1_tu(vbfloat16m1_t vd, vuint8mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m2_tu( +// 
CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_xu_v_bf16m2_tu(vbfloat16m2_t vd, vuint8m1_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_xu_v_bf16m4_tu(vbfloat16m4_t vd, vuint8m2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_xu_v_bf16m8_tu(vbfloat16m8_t vd, vuint8m4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_bf16_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwcvt_f_f_v_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwcvt_f_f_v_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_bf16m1_f32m2_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwcvt_f_f_v_bf16m2_f32m4_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwcvt_f_f_v_bf16m4_f32m8_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_x_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + vint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_x_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + vint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_x_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_x_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_x_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_x_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, + vint8m4_t vs2, size_t vl) { + return 
__riscv_vfwcvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_xu_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + vuint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_xu_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + vuint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_xu_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vuint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_xu_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vuint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_xu_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vuint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_xu_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, + vuint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] 
= call @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwcvt_f_f_v_bf16mf4_f32mf2_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwcvt_f_f_v_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwcvt_f_f_v_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwcvt_f_f_v_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_x_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + vint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_x_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + vint8mf4_t vs2, size_t vl) { + 
return __riscv_vfwcvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_x_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_x_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_x_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_x_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, + vint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_xu_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + vuint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_xu_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + vuint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_xu_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vuint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_xu_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vuint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_xu_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vuint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_xu_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, + vuint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwcvt_f_f_v_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwcvt_f_f_v_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + 
vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwcvt_f_f_v_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwcvt_f_f_v_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_x_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_x_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_x_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_x_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_x_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vint8m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_x_v_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.x.v.mask.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_x_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, + vint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv1bf16.nxv1i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_vfwcvt_f_xu_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vuint8mf8_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv2bf16.nxv2i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_vfwcvt_f_xu_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vuint8mf4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv4bf16.nxv4i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_vfwcvt_f_xu_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vuint8mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv8bf16.nxv8i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_vfwcvt_f_xu_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vuint8m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv16bf16.nxv16i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_vfwcvt_f_xu_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vuint8m2_t vs2, size_t vl) { + return 
__riscv_vfwcvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_xu_v_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.xu.v.mask.nxv32bf16.nxv32i8.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_vfwcvt_f_xu_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, + vuint8m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_bf16_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwcvt_f_f_v_bf16mf4_f32mf2_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwcvt_f_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwcvt_f_f_v_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwcvt_f_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwcvt_f_f_v_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwcvt_f_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwcvt_f_f_v_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwcvt_f_f_v_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwcvt_f_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwmacc.c new file mode 100644 index 
0000000000000..b05f88028874d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwmacc.c @@ -0,0 +1,977 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwmacc_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t 
vs2, size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 
[[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_rm_tum( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: 
define dso_local @test_vfwmacc_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_rm_tumu( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], 
i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat 
noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmacc_vf_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmacc_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + 
vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmacc_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmacc_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmacc_vf_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmacc_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwmsac.c new file mode 100644 index 0000000000000..93721f6a889d9 --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwmsac.c @@ -0,0 +1,977 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], 
[[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
[[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, vl); +} 
+ +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + 
vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t 
test_vfwmsac_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t 
test_vfwmsac_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_rm_tum( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], 
[[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: 
[[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_rm_tumu( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t 
test_vfwmsac_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmsac_vf_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmsac_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return 
__riscv_vfwmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmsac_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmsac_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmsac_vf_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmsac_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwmul.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwmul.c new file mode 100644 index 0000000000000..4a2b5e39ef2cf --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwmul.c @@ -0,0 +1,975 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call
@llvm.riscv.vfwmul.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t 
test_vfwmul_vf_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], 
i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_mu( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwmul_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef 
[[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwmul_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t 
test_vfwmul_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return 
__riscv_vfwmul_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwmul_vf_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwmul_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat 
[[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwmul_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwmul_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwmul_vf_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwmul.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwmul_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwmul_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwnmacc.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwnmacc.c new file mode 100644 index 0000000000000..57e433441cd40 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwnmacc.c @@ -0,0 +1,994 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +//
CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_tum(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwnmacc_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_tumu(vbool32_t vm, 
vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], 
[[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], 
[[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], 
[[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_rm_tum( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, + vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); 
+} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, + vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, 
i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_rm_tumu( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwnmacc_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, + vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, + vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) 
+// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmacc_vf_bf16mf4_f32mf2_rm_mu( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmacc_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], 
bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmacc_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmacc_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmacc_vf_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmacc.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmacc_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmacc_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwnmsac.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwnmsac.c new file mode 100644 index 0000000000000..42da060126a2b --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwnmsac.c @@ -0,0 +1,994 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: 
riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t
test_vfwnmsac_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_tum(vbool64_t vm, + vfloat32mf2_t vd, 
__bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], 
i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, + vfloat32mf2_t vd, __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: 
[[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + __bf16 vs1, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return 
__riscv_vfwnmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t 
test_vfwnmsac_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + __bf16 vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t 
test_vfwnmsac_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, __bf16 vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, __bf16 vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_tu(vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_rm_tum( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, + vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, + vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); 
+} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tum(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_rm_tumu( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], 
bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, + vfloat32m1_t vd, __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, + vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, + vfloat32m2_t vd, __bf16 vs1, + vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, + vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwnmsac_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, + vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_tumu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs1, + vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv1f32.bf16.nxv1bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwnmsac_vf_bf16mf4_f32mf2_rm_mu( + vbool64_t vm, vfloat32mf2_t vd, __bf16 vs1, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv2f32.bf16.nxv2bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwnmsac_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + __bf16 vs1, + vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs1, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv4f32.bf16.nxv4bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwnmsac_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + __bf16 vs1, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs1, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv8f32.bf16.nxv8bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwnmsac_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + __bf16 vs1, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs1, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwnmsac_vf_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], bfloat noundef [[VS1:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwnmsac.mask.nxv16f32.bf16.nxv16bf16.i64( [[VD]], bfloat [[VS1]], [[VS2]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwnmsac_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + __bf16 vs1, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_vfwnmsac_mu(vm, vd, vs1, vs2, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwsub.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwsub.c new file mode 100644 index 0000000000000..1378bc963b216 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vfwsub.c @@ -0,0 +1,1932 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_tu(vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]],
bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_tu(vfloat32m2_t vd, vbfloat16m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] 
+// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_tu(vfloat32m4_t vd, vbfloat16m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_tu(vfloat32m8_t vd, vbfloat16m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_tu( +// CHECK-RV64-SAME: 
[[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tu(vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 7, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_tu(vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], 
[[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwsub_wv_bf16m1_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tum(vm, vd, vs2, rs1, vl); +} 
+ +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tum(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tum(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + 
vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret 
[[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], 
[[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tumu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tumu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vbfloat16mf4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, vbfloat16m1_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], 
[[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, vbfloat16m2_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_mu( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, vbfloat16m4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_mu(vm, vd, vs2, vs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 7, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_mu(vm, vd, vs2, rs1, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_rm_tu(vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_rm_tu( +// 
CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_rm_tu(vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_rm_tu(vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_rm_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_rm_tu(vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_rm_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_rm_tu(vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv8f32.bf16.i64( [[VD]], [[VS2]], 
bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_rm_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_rm_tu(vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tu(vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_rm_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], i64 0, i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_rm_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_wf_tu(vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 
0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_rm_tum(vbool64_t vm, + vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_rm_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat 
noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_rm_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_rm_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return 
__riscv_vfwsub_vv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_rm_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 
[[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tum(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_rm_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_rm_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tum(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_rm_tumu(vbool64_t vm, + vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_rm_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], 
[[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wv_bf16mf2_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_rm_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + 
vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_rm_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_rm_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m4_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_tumu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_rm_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_rm_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_tumu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_vv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwsub_vf_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv1f32.nxv1bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_vf_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vbfloat16mf4_t vs2, + __bf16 rs1, size_t vl) { + return __riscv_vfwsub_vf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf4_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.nxv1bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wv_bf16mf4_f32mf2_rm_mu(vbool64_t vm, + vfloat32mf2_t vd, + vfloat32mf2_t vs2, + vbfloat16mf4_t vs1, + size_t vl) { + return __riscv_vfwsub_wv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32mf2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv1f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_vfwsub_wf_bf16_f32mf2_rm_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv2f32.nxv2bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_vf_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vbfloat16mf2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16mf2_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.nxv2bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t 
test_vfwsub_wv_bf16mf2_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, + vbfloat16mf2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m1_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv2f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_vfwsub_wf_bf16_f32m1_rm_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_vv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv4f32.nxv4bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_vf_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vbfloat16m1_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m1_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.nxv4bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wv_bf16m1_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, + vbfloat16m1_t vs1, size_t vl) { + return __riscv_vfwsub_wv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m2_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv4f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwsub_wf_bf16_f32m2_rm_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] 
= call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_vv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv8f32.nxv8bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_vf_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vbfloat16m2_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wv_bf16m2_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.nxv8bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wv_bf16m2_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, + vbfloat16m2_t vs1, size_t vl) { + return __riscv_vfwsub_wv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m4_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv8f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_vfwsub_wf_bf16_f32m4_rm_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_vv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_vf_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.mask.nxv16f32.nxv16bf16.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_vf_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vbfloat16m4_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_vf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_vfwsub_wv_bf16m4_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.nxv16bf16.i64( [[VD]], [[VS2]], [[VS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wv_bf16m4_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, + vbfloat16m4_t vs1, size_t vl) { + return __riscv_vfwsub_wv_mu(vm, vd, vs2, vs1, __RISCV_FRM_RNE, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vfwsub_wf_bf16_f32m8_rm_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], bfloat noundef [[RS1:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwsub.w.mask.nxv16f32.bf16.i64( [[VD]], [[VS2]], bfloat [[RS1]], [[VM]], i64 0, i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_vfwsub_wf_bf16_f32m8_rm_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, __bf16 rs1, + size_t vl) { + return __riscv_vfwsub_wf_mu(vm, vd, vs2, rs1, __RISCV_FRM_RNE, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfeq.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfeq.c new file mode 100644 index 0000000000000..3945f826809b0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfeq.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16mf4_b64_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfeq_vv_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfeq_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16mf4_b64_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfeq_vf_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +//
+vbool32_t test_vmfeq_vv_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfeq_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfeq_vf_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfeq_vv_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfeq_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfeq_vf_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfeq_vv_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfeq_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv8bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfeq_vf_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv16bf16.nxv16bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfeq_vv_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) { + return __riscv_vmfeq_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m4_b4_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv16bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool4_t test_vmfeq_vf_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vv_bf16m8_b2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv32bf16.nxv32bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfeq_vv_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) { + return __riscv_vmfeq_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfeq_vf_bf16m8_b2_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfeq.mask.nxv32bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool2_t test_vmfeq_vf_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfeq_mu(mask, maskedoff, op1, op2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfge.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfge.c new file mode 100644 index 0000000000000..82586da09b325 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfge.c @@ -0,0 +1,129 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfbfa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <riscv_vector.h> + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16mf4_b64_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv1bf16.nxv1bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfge_vv_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) { + return __riscv_vmfge_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16mf4_b64_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]])
#[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv1bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool64_t test_vmfge_vf_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv2bf16.nxv2bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfge_vv_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) { + return __riscv_vmfge_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16mf2_b32_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv2bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool32_t test_vmfge_vf_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv4bf16.nxv4bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfge_vv_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) { + return __riscv_vmfge_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m1_b16_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv4bf16.bf16.i64( [[MASKEDOFF]], [[OP1]], bfloat [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool16_t test_vmfge_vf_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) { + return __riscv_vmfge_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vv_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vmfge.mask.nxv8bf16.nxv8bf16.i64( [[MASKEDOFF]], [[OP1]], [[OP2]], [[MASK]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbool8_t test_vmfge_vv_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) { + return __riscv_vmfge_mu(mask, maskedoff, op1, op2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_vmfge_vf_bf16m2_b8_mu( +// CHECK-RV64-SAME: [[MASK:%.*]], [[MASKEDOFF:%.*]], [[OP1:%.*]], bfloat noundef 
[[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmfge.mask.nxv8bf16.bf16.i64(<vscale x 8 x i1> [[MASKEDOFF]], <vscale x 8 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x i1> [[TMP0]]
+//
+vbool8_t test_vmfge_vf_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfge_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfge_vv_bf16m4_b4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x i1> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], <vscale x 16 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfge.mask.nxv16bf16.nxv16bf16.i64(<vscale x 16 x i1> [[MASKEDOFF]], <vscale x 16 x bfloat> [[OP1]], <vscale x 16 x bfloat> [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmfge_vv_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) {
+  return __riscv_vmfge_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfge_vf_bf16m4_b4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x i1> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfge.mask.nxv16bf16.bf16.i64(<vscale x 16 x i1> [[MASKEDOFF]], <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmfge_vf_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfge_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfge_vv_bf16m8_b2_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x i1> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], <vscale x 32 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfge.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x i1> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfge_vv_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+  return __riscv_vmfge_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfge_vf_bf16m8_b2_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x i1> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfge.mask.nxv32bf16.bf16.i64(<vscale x 32 x i1> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfge_vf_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfge_mu(mask, maskedoff, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfgt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfgt.c
new file mode 100644
index 0000000000000..75ccbbc1e8a1e
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfgt.c
@@ -0,0 +1,129 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfgt_vv_bf16mf4_b64_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i1> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfgt.mask.nxv1bf16.nxv1bf16.i64(<vscale x 1 x i1> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], <vscale x 1 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmfgt_vv_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vmfgt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfgt_vf_bf16mf4_b64_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i1> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfgt.mask.nxv1bf16.bf16.i64(<vscale x 1 x i1> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 1 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmfgt_vf_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfgt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfgt_vv_bf16mf2_b32_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i1> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2bf16.nxv2bf16.i64(<vscale x 2 x i1> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], <vscale x 2 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmfgt_vv_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+  return __riscv_vmfgt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfgt_vf_bf16mf2_b32_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i1> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfgt.mask.nxv2bf16.bf16.i64(<vscale x 2 x i1> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 2 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmfgt_vf_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfgt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmfgt_vv_bf16m1_b16_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i1> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmfgt.mask.nxv4bf16.nxv4bf16.i64(<vscale x 4 x i1> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], <vscale x 4 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x i1> [[TMP0]]
+//
+vbool16_t test_vmfgt_vv_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) {
+  return __riscv_vmfgt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmfgt_vf_bf16m1_b16_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i1> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmfgt.mask.nxv4bf16.bf16.i64(<vscale x 4 x i1> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 4 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x i1> [[TMP0]]
+//
+vbool16_t test_vmfgt_vf_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfgt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmfgt_vv_bf16m2_b8_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x i1> [[MASKEDOFF:%.*]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmfgt.mask.nxv8bf16.nxv8bf16.i64(<vscale x 8 x i1> [[MASKEDOFF]], <vscale x 8 x bfloat> [[OP1]], <vscale x 8 x bfloat> [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x i1> [[TMP0]]
+//
+vbool8_t test_vmfgt_vv_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) {
+  return __riscv_vmfgt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmfgt_vf_bf16m2_b8_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x i1> [[MASKEDOFF:%.*]], <vscale x 8 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmfgt.mask.nxv8bf16.bf16.i64(<vscale x 8 x i1> [[MASKEDOFF]], <vscale x 8 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x i1> [[TMP0]]
+//
+vbool8_t test_vmfgt_vf_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfgt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfgt_vv_bf16m4_b4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x i1> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], <vscale x 16 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfgt.mask.nxv16bf16.nxv16bf16.i64(<vscale x 16 x i1> [[MASKEDOFF]], <vscale x 16 x bfloat> [[OP1]], <vscale x 16 x bfloat> [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmfgt_vv_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) {
+  return __riscv_vmfgt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfgt_vf_bf16m4_b4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x i1> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfgt.mask.nxv16bf16.bf16.i64(<vscale x 16 x i1> [[MASKEDOFF]], <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmfgt_vf_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfgt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfgt_vv_bf16m8_b2_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x i1> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], <vscale x 32 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfgt.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x i1> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfgt_vv_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+  return __riscv_vmfgt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfgt_vf_bf16m8_b2_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x i1> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfgt.mask.nxv32bf16.bf16.i64(<vscale x 32 x i1> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfgt_vf_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfgt_mu(mask, maskedoff, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfle.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfle.c
new file mode 100644
index 0000000000000..49ff1c9812d62
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfle.c
@@ -0,0 +1,129 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfle_vv_bf16mf4_b64_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i1> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1bf16.nxv1bf16.i64(<vscale x 1 x i1> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], <vscale x 1 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmfle_vv_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vmfle_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfle_vf_bf16mf4_b64_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i1> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfle.mask.nxv1bf16.bf16.i64(<vscale x 1 x i1> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 1 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmfle_vf_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfle_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfle_vv_bf16mf2_b32_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i1> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2bf16.nxv2bf16.i64(<vscale x 2 x i1> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], <vscale x 2 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmfle_vv_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+  return __riscv_vmfle_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfle_vf_bf16mf2_b32_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i1> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfle.mask.nxv2bf16.bf16.i64(<vscale x 2 x i1> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 2 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmfle_vf_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfle_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmfle_vv_bf16m1_b16_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i1> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4bf16.nxv4bf16.i64(<vscale x 4 x i1> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], <vscale x 4 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x i1> [[TMP0]]
+//
+vbool16_t test_vmfle_vv_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) {
+  return __riscv_vmfle_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmfle_vf_bf16m1_b16_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i1> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmfle.mask.nxv4bf16.bf16.i64(<vscale x 4 x i1> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 4 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x i1> [[TMP0]]
+//
+vbool16_t test_vmfle_vf_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfle_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmfle_vv_bf16m2_b8_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x i1> [[MASKEDOFF:%.*]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmfle.mask.nxv8bf16.nxv8bf16.i64(<vscale x 8 x i1> [[MASKEDOFF]], <vscale x 8 x bfloat> [[OP1]], <vscale x 8 x bfloat> [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x i1> [[TMP0]]
+//
+vbool8_t test_vmfle_vv_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) {
+  return __riscv_vmfle_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmfle_vf_bf16m2_b8_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x i1> [[MASKEDOFF:%.*]], <vscale x 8 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmfle.mask.nxv8bf16.bf16.i64(<vscale x 8 x i1> [[MASKEDOFF]], <vscale x 8 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x i1> [[TMP0]]
+//
+vbool8_t test_vmfle_vf_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfle_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfle_vv_bf16m4_b4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x i1> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], <vscale x 16 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfle.mask.nxv16bf16.nxv16bf16.i64(<vscale x 16 x i1> [[MASKEDOFF]], <vscale x 16 x bfloat> [[OP1]], <vscale x 16 x bfloat> [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmfle_vv_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) {
+  return __riscv_vmfle_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfle_vf_bf16m4_b4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x i1> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfle.mask.nxv16bf16.bf16.i64(<vscale x 16 x i1> [[MASKEDOFF]], <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmfle_vf_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfle_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfle_vv_bf16m8_b2_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x i1> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], <vscale x 32 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfle.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x i1> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfle_vv_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+  return __riscv_vmfle_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfle_vf_bf16m8_b2_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x i1> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfle.mask.nxv32bf16.bf16.i64(<vscale x 32 x i1> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfle_vf_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfle_mu(mask, maskedoff, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmflt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmflt.c
new file mode 100644
index 0000000000000..24b3f9c16d943
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmflt.c
@@ -0,0 +1,129 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmflt_vv_bf16mf4_b64_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i1> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmflt.mask.nxv1bf16.nxv1bf16.i64(<vscale x 1 x i1> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], <vscale x 1 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmflt_vv_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vmflt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmflt_vf_bf16mf4_b64_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i1> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmflt.mask.nxv1bf16.bf16.i64(<vscale x 1 x i1> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 1 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmflt_vf_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmflt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmflt_vv_bf16mf2_b32_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i1> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmflt.mask.nxv2bf16.nxv2bf16.i64(<vscale x 2 x i1> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], <vscale x 2 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmflt_vv_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+  return __riscv_vmflt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmflt_vf_bf16mf2_b32_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i1> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmflt.mask.nxv2bf16.bf16.i64(<vscale x 2 x i1> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 2 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmflt_vf_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmflt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmflt_vv_bf16m1_b16_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i1> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmflt.mask.nxv4bf16.nxv4bf16.i64(<vscale x 4 x i1> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], <vscale x 4 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x i1> [[TMP0]]
+//
+vbool16_t test_vmflt_vv_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) {
+  return __riscv_vmflt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmflt_vf_bf16m1_b16_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i1> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmflt.mask.nxv4bf16.bf16.i64(<vscale x 4 x i1> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 4 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x i1> [[TMP0]]
+//
+vbool16_t test_vmflt_vf_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmflt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmflt_vv_bf16m2_b8_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x i1> [[MASKEDOFF:%.*]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmflt.mask.nxv8bf16.nxv8bf16.i64(<vscale x 8 x i1> [[MASKEDOFF]], <vscale x 8 x bfloat> [[OP1]], <vscale x 8 x bfloat> [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x i1> [[TMP0]]
+//
+vbool8_t test_vmflt_vv_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) {
+  return __riscv_vmflt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmflt_vf_bf16m2_b8_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x i1> [[MASKEDOFF:%.*]], <vscale x 8 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmflt.mask.nxv8bf16.bf16.i64(<vscale x 8 x i1> [[MASKEDOFF]], <vscale x 8 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x i1> [[TMP0]]
+//
+vbool8_t test_vmflt_vf_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmflt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmflt_vv_bf16m4_b4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x i1> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], <vscale x 16 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmflt.mask.nxv16bf16.nxv16bf16.i64(<vscale x 16 x i1> [[MASKEDOFF]], <vscale x 16 x bfloat> [[OP1]], <vscale x 16 x bfloat> [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmflt_vv_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) {
+  return __riscv_vmflt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmflt_vf_bf16m4_b4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x i1> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmflt.mask.nxv16bf16.bf16.i64(<vscale x 16 x i1> [[MASKEDOFF]], <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmflt_vf_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmflt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmflt_vv_bf16m8_b2_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x i1> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], <vscale x 32 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmflt.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x i1> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmflt_vv_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+  return __riscv_vmflt_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmflt_vf_bf16m8_b2_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x i1> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmflt.mask.nxv32bf16.bf16.i64(<vscale x 32 x i1> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmflt_vf_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmflt_mu(mask, maskedoff, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfne.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfne.c
new file mode 100644
index 0000000000000..ca3e134910dfb
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/zvfbfa/policy/overloaded/vmfne.c
@@ -0,0 +1,129 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: riscv-registered-target
+// RUN: %clang_cc1 -triple riscv64 -target-feature +v \
+// RUN:   -target-feature +experimental-zvfbfa -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
+// RUN:   FileCheck --check-prefix=CHECK-RV64 %s
+
+#include <riscv_vector.h>
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfne_vv_bf16mf4_b64_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i1> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], <vscale x 1 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfne.mask.nxv1bf16.nxv1bf16.i64(<vscale x 1 x i1> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], <vscale x 1 x bfloat> [[OP2]], <vscale x 1 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmfne_vv_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, vbfloat16mf4_t op2, size_t vl) {
+  return __riscv_vmfne_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 1 x i1> @test_vmfne_vf_bf16mf4_b64_mu(
+// CHECK-RV64-SAME: <vscale x 1 x i1> [[MASK:%.*]], <vscale x 1 x i1> [[MASKEDOFF:%.*]], <vscale x 1 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 1 x i1> @llvm.riscv.vmfne.mask.nxv1bf16.bf16.i64(<vscale x 1 x i1> [[MASKEDOFF]], <vscale x 1 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 1 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 1 x i1> [[TMP0]]
+//
+vbool64_t test_vmfne_vf_bf16mf4_b64_mu(vbool64_t mask, vbool64_t maskedoff, vbfloat16mf4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfne_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfne_vv_bf16mf2_b32_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i1> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], <vscale x 2 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfne.mask.nxv2bf16.nxv2bf16.i64(<vscale x 2 x i1> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], <vscale x 2 x bfloat> [[OP2]], <vscale x 2 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmfne_vv_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, vbfloat16mf2_t op2, size_t vl) {
+  return __riscv_vmfne_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i1> @test_vmfne_vf_bf16mf2_b32_mu(
+// CHECK-RV64-SAME: <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i1> [[MASKEDOFF:%.*]], <vscale x 2 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.riscv.vmfne.mask.nxv2bf16.bf16.i64(<vscale x 2 x i1> [[MASKEDOFF]], <vscale x 2 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 2 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 2 x i1> [[TMP0]]
+//
+vbool32_t test_vmfne_vf_bf16mf2_b32_mu(vbool32_t mask, vbool32_t maskedoff, vbfloat16mf2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfne_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmfne_vv_bf16m1_b16_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i1> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], <vscale x 4 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmfne.mask.nxv4bf16.nxv4bf16.i64(<vscale x 4 x i1> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], <vscale x 4 x bfloat> [[OP2]], <vscale x 4 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x i1> [[TMP0]]
+//
+vbool16_t test_vmfne_vv_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, vbfloat16m1_t op2, size_t vl) {
+  return __riscv_vmfne_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i1> @test_vmfne_vf_bf16m1_b16_mu(
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i1> [[MASKEDOFF:%.*]], <vscale x 4 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i1> @llvm.riscv.vmfne.mask.nxv4bf16.bf16.i64(<vscale x 4 x i1> [[MASKEDOFF]], <vscale x 4 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 4 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 4 x i1> [[TMP0]]
+//
+vbool16_t test_vmfne_vf_bf16m1_b16_mu(vbool16_t mask, vbool16_t maskedoff, vbfloat16m1_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfne_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmfne_vv_bf16m2_b8_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x i1> [[MASKEDOFF:%.*]], <vscale x 8 x bfloat> [[OP1:%.*]], <vscale x 8 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmfne.mask.nxv8bf16.nxv8bf16.i64(<vscale x 8 x i1> [[MASKEDOFF]], <vscale x 8 x bfloat> [[OP1]], <vscale x 8 x bfloat> [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x i1> [[TMP0]]
+//
+vbool8_t test_vmfne_vv_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, vbfloat16m2_t op2, size_t vl) {
+  return __riscv_vmfne_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i1> @test_vmfne_vf_bf16m2_b8_mu(
+// CHECK-RV64-SAME: <vscale x 8 x i1> [[MASK:%.*]], <vscale x 8 x i1> [[MASKEDOFF:%.*]], <vscale x 8 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i1> @llvm.riscv.vmfne.mask.nxv8bf16.bf16.i64(<vscale x 8 x i1> [[MASKEDOFF]], <vscale x 8 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 8 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 8 x i1> [[TMP0]]
+//
+vbool8_t test_vmfne_vf_bf16m2_b8_mu(vbool8_t mask, vbool8_t maskedoff, vbfloat16m2_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfne_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfne_vv_bf16m4_b4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x i1> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], <vscale x 16 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfne.mask.nxv16bf16.nxv16bf16.i64(<vscale x 16 x i1> [[MASKEDOFF]], <vscale x 16 x bfloat> [[OP1]], <vscale x 16 x bfloat> [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmfne_vv_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, vbfloat16m4_t op2, size_t vl) {
+  return __riscv_vmfne_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i1> @test_vmfne_vf_bf16m4_b4_mu(
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[MASK:%.*]], <vscale x 16 x i1> [[MASKEDOFF:%.*]], <vscale x 16 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.riscv.vmfne.mask.nxv16bf16.bf16.i64(<vscale x 16 x i1> [[MASKEDOFF]], <vscale x 16 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 16 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
+//
+vbool4_t test_vmfne_vf_bf16m4_b4_mu(vbool4_t mask, vbool4_t maskedoff, vbfloat16m4_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfne_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfne_vv_bf16m8_b2_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x i1> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], <vscale x 32 x bfloat> [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfne.mask.nxv32bf16.nxv32bf16.i64(<vscale x 32 x i1> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], <vscale x 32 x bfloat> [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfne_vv_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, vbfloat16m8_t op2, size_t vl) {
+  return __riscv_vmfne_mu(mask, maskedoff, op1, op2, vl);
+}
+
+// CHECK-RV64-LABEL: define dso_local <vscale x 32 x i1> @test_vmfne_vf_bf16m8_b2_mu(
+// CHECK-RV64-SAME: <vscale x 32 x i1> [[MASK:%.*]], <vscale x 32 x i1> [[MASKEDOFF:%.*]], <vscale x 32 x bfloat> [[OP1:%.*]], bfloat noundef [[OP2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-NEXT:  [[ENTRY:.*:]]
+// CHECK-RV64-NEXT:    [[TMP0:%.*]] = call <vscale x 32 x i1> @llvm.riscv.vmfne.mask.nxv32bf16.bf16.i64(<vscale x 32 x i1> [[MASKEDOFF]], <vscale x 32 x bfloat> [[OP1]], bfloat [[OP2]], <vscale x 32 x i1> [[MASK]], i64 [[VL]])
+// CHECK-RV64-NEXT:    ret <vscale x 32 x i1> [[TMP0]]
+//
+vbool2_t test_vmfne_vf_bf16m8_b2_mu(vbool2_t mask, vbool2_t maskedoff, vbfloat16m8_t op1, __bf16 op2, size_t vl) {
+  return __riscv_vmfne_mu(mask, maskedoff, op1, op2, vl);
+}
+
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-i128.c b/clang/test/CodeGen/SystemZ/builtins-systemz-i128.c
index d25b8d84aa2d5..2a20607123af9 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-i128.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-i128.c
@@ -14,124 +14,124 @@ volatile vector unsigned long long vul;
 // CHECK-LABEL: define dso_local void @test(
 // CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3:![0-9]+]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]]
 // CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to i128
 // CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
 // CHECK-NEXT:    [[ADD_I:%.*]] = add nsw i128 [[TMP3]], [[TMP2]]
 // CHECK-NEXT:    [[TMP4:%.*]] = 
bitcast i128 [[ADD_I]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP4]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP5:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP6:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP4]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP5:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP6:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP5]] to i128 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to i128 // CHECK-NEXT: [[TMP9:%.*]] = tail call i128 @llvm.s390.vaccq(i128 [[TMP7]], i128 [[TMP8]]) // CHECK-NEXT: [[TMP10:%.*]] = bitcast i128 [[TMP9]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP10]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP11:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP12:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP13:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP10]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP11:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP12:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP13:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP11]] to i128 // CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP12]] to i128 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP13]] to i128 // CHECK-NEXT: [[TMP17:%.*]] = tail call i128 @llvm.s390.vacq(i128 [[TMP14]], i128 [[TMP15]], i128 [[TMP16]]) // CHECK-NEXT: [[TMP18:%.*]] = bitcast i128 [[TMP17]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP18]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP19:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP20:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP21:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP18]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP19:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP20:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP21:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i8> [[TMP19]] to i128 // CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i8> [[TMP20]] to i128 // CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i8> [[TMP21]] to i128 // CHECK-NEXT: [[TMP25:%.*]] = tail call i128 @llvm.s390.vacccq(i128 [[TMP22]], i128 [[TMP23]], i128 [[TMP24]]) // CHECK-NEXT: [[TMP26:%.*]] = bitcast i128 [[TMP25]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP26]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP27:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP28:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP26]], ptr @vuc, align 8, !tbaa 
[[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP27:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP28:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i8> [[TMP27]] to i128 // CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i8> [[TMP28]] to i128 // CHECK-NEXT: [[SUB_I:%.*]] = sub nsw i128 [[TMP29]], [[TMP30]] // CHECK-NEXT: [[TMP31:%.*]] = bitcast i128 [[SUB_I]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP31]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP32:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP33:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP31]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP32:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP33:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP34:%.*]] = bitcast <16 x i8> [[TMP32]] to i128 // CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i8> [[TMP33]] to i128 // CHECK-NEXT: [[TMP36:%.*]] = tail call i128 @llvm.s390.vscbiq(i128 [[TMP34]], i128 [[TMP35]]) // CHECK-NEXT: [[TMP37:%.*]] = bitcast i128 [[TMP36]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP37]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP38:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP39:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP40:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP37]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP38:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP39:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP40:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i8> [[TMP38]] to i128 // CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i8> [[TMP39]] to i128 // CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i8> [[TMP40]] to i128 // CHECK-NEXT: [[TMP44:%.*]] = tail call i128 @llvm.s390.vsbiq(i128 [[TMP41]], i128 [[TMP42]], i128 [[TMP43]]) // CHECK-NEXT: [[TMP45:%.*]] = bitcast i128 [[TMP44]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP45]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP46:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP47:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP48:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP45]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP46:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP47:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP48:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP49:%.*]] = bitcast <16 x i8> [[TMP46]] to i128 // CHECK-NEXT: [[TMP50:%.*]] = bitcast <16 x i8> [[TMP47]] to i128 // CHECK-NEXT: [[TMP51:%.*]] = bitcast <16 x i8> [[TMP48]] to i128 // CHECK-NEXT: [[TMP52:%.*]] = tail call i128 @llvm.s390.vsbcbiq(i128 [[TMP49]], i128 [[TMP50]], 
i128 [[TMP51]]) // CHECK-NEXT: [[TMP53:%.*]] = bitcast i128 [[TMP52]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP53]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP54:%.*]] = load volatile <4 x i32>, ptr @vui, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP55:%.*]] = load volatile <4 x i32>, ptr @vui, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP53]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP54:%.*]] = load volatile <4 x i32>, ptr @vui, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP55:%.*]] = load volatile <4 x i32>, ptr @vui, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP56:%.*]] = tail call i128 @llvm.s390.vsumqf(<4 x i32> [[TMP54]], <4 x i32> [[TMP55]]) // CHECK-NEXT: [[TMP57:%.*]] = bitcast i128 [[TMP56]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP57]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP58:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP59:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP57]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP58:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP59:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP60:%.*]] = tail call i128 @llvm.s390.vsumqg(<2 x i64> [[TMP58]], <2 x i64> [[TMP59]]) // CHECK-NEXT: [[TMP61:%.*]] = bitcast i128 [[TMP60]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP61]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP62:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP63:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP61]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP62:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP63:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP64:%.*]] = tail call i128 @llvm.s390.vgfmg(<2 x i64> [[TMP62]], <2 x i64> [[TMP63]]) // CHECK-NEXT: [[TMP65:%.*]] = bitcast i128 [[TMP64]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP65]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP66:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP67:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP68:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP65]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP66:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP67:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP68:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP69:%.*]] = bitcast <16 x i8> [[TMP68]] to i128 // CHECK-NEXT: [[TMP70:%.*]] = tail call i128 @llvm.s390.vgfmag(<2 x i64> [[TMP66]], <2 x i64> [[TMP67]], i128 [[TMP69]]) // CHECK-NEXT: [[TMP71:%.*]] = bitcast i128 [[TMP70]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP71]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP72:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: 
[[TMP73:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP74:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP71]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP72:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP73:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP74:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP75:%.*]] = bitcast <16 x i8> [[TMP74]] to i128 // CHECK-NEXT: [[TMP76:%.*]] = tail call i128 @llvm.s390.vmslg(<2 x i64> [[TMP72]], <2 x i64> [[TMP73]], i128 [[TMP75]], i32 0) // CHECK-NEXT: [[TMP77:%.*]] = bitcast i128 [[TMP76]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP77]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP78:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP79:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP80:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP77]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP78:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP79:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP80:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP81:%.*]] = bitcast <16 x i8> [[TMP80]] to i128 // CHECK-NEXT: [[TMP82:%.*]] = tail call i128 @llvm.s390.vmslg(<2 x i64> [[TMP78]], <2 x i64> [[TMP79]], i128 [[TMP81]], i32 4) // CHECK-NEXT: [[TMP83:%.*]] = bitcast i128 [[TMP82]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP83]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP84:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP85:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP86:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP83]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP84:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP85:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP86:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP87:%.*]] = bitcast <16 x i8> [[TMP86]] to i128 // CHECK-NEXT: [[TMP88:%.*]] = tail call i128 @llvm.s390.vmslg(<2 x i64> [[TMP84]], <2 x i64> [[TMP85]], i128 [[TMP87]], i32 8) // CHECK-NEXT: [[TMP89:%.*]] = bitcast i128 [[TMP88]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP89]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP90:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP91:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP92:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP89]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP90:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP91:%.*]] = load volatile <2 x i64>, ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] +// 
CHECK-NEXT: [[TMP92:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP93:%.*]] = bitcast <16 x i8> [[TMP92]] to i128 // CHECK-NEXT: [[TMP94:%.*]] = tail call i128 @llvm.s390.vmslg(<2 x i64> [[TMP90]], <2 x i64> [[TMP91]], i128 [[TMP93]], i32 12) // CHECK-NEXT: [[TMP95:%.*]] = bitcast i128 [[TMP94]] to <16 x i8> -// CHECK-NEXT: store volatile <16 x i8> [[TMP95]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP96:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP97:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <16 x i8> [[TMP95]], ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP96:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP97:%.*]] = load volatile <16 x i8>, ptr @vuc, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[TMP98:%.*]] = tail call <2 x i64> @llvm.s390.vbperm(<16 x i8> [[TMP96]], <16 x i8> [[TMP97]]) -// CHECK-NEXT: store volatile <2 x i64> [[TMP98]], ptr @vul, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <2 x i64> [[TMP98]], ptr @vul, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test(void) { @@ -159,7 +159,7 @@ void test(void) { vul = vec_bperm_u128(vuc, vuc); } //. -// CHECK: [[CHAR_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} -// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} -// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META5:![0-9]+]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[CHAR_TBAA7]] = !{[[META5]], [[META5]], i64 0} //. diff --git a/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-16Al.c b/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-16Al.c index 5f3b0ec546462..6bd61f9c130bd 100644 --- a/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-16Al.c +++ b/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-16Al.c @@ -17,7 +17,7 @@ __int128 Des __attribute__((aligned(16))); // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load atomic i128, ptr @Ptr seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2:![0-9]+]] +// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6:![0-9]+]] // CHECK-NEXT: ret void // __int128 f1() { @@ -29,7 +29,7 @@ __int128 f1() { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load atomic i128, ptr @Ptr seq_cst, align 16 // CHECK-NEXT: store i128 [[TMP0]], ptr @Ret, align 16 -// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f2() { @@ -40,7 +40,7 @@ __int128 f2() { // CHECK-LABEL: define dso_local void @f3( // CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: store atomic i128 [[TMP0]], ptr @Ptr seq_cst, align 16 // CHECK-NEXT: ret void // @@ -62,9 +62,9 @@ void f4() { // CHECK-LABEL: define dso_local void @f5( // CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f5() { @@ -77,7 +77,7 @@ __int128 f5() { // CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16 // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: store i128 [[TMP1]], ptr @Ret, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f6() { @@ -88,7 +88,7 @@ __int128 f6() { // CHECK-LABEL: define dso_local noundef zeroext i1 @f7( // CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Des, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Des, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Exp, align 16 // CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP1]], i128 [[TMP0]] seq_cst seq_cst, align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1 @@ -128,10 +128,10 @@ _Bool f8() { // CHECK-LABEL: define dso_local void @f9( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = add i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f9() { @@ -141,10 +141,10 @@ __int128 f9() { // CHECK-LABEL: define dso_local void @f10( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = sub i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f10() { @@ -154,10 +154,10 @@ __int128 f10() { // CHECK-LABEL: define dso_local void @f11( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) 
[[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f11() { @@ -167,10 +167,10 @@ __int128 f11() { // CHECK-LABEL: define dso_local void @f12( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = xor i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f12() { @@ -180,10 +180,10 @@ __int128 f12() { // CHECK-LABEL: define dso_local void @f13( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = or i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f13() { @@ -193,11 +193,11 @@ __int128 f13() { // CHECK-LABEL: define dso_local void @f14( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]] // CHECK-NEXT: [[TMP3:%.*]] = xor i128 [[TMP2]], -1 -// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f14() { @@ -207,9 +207,9 @@ __int128 f14() { // CHECK-LABEL: define dso_local void @f15( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, 
ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f15() { @@ -219,9 +219,9 @@ __int128 f15() { // CHECK-LABEL: define dso_local void @f16( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f16() { @@ -231,9 +231,9 @@ __int128 f16() { // CHECK-LABEL: define dso_local void @f17( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f17() { @@ -243,9 +243,9 @@ __int128 f17() { // CHECK-LABEL: define dso_local void @f18( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f18() { @@ -255,9 +255,9 @@ __int128 f18() { // CHECK-LABEL: define dso_local void @f19( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f19() { @@ -267,17 +267,17 @@ __int128 f19() { // CHECK-LABEL: define dso_local void @f20( // CHECK-SAME: ptr dead_on_unwind noalias writable 
writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f20() { return __atomic_fetch_nand(&Ptr, Val, memory_order_seq_cst); } //. -// CHECK: [[__INT128_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"__int128", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[__INT128_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"__int128", [[META4]], i64 0} //. diff --git a/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c b/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c index 3ac5959a29dcb..332d7bd21faf5 100644 --- a/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c +++ b/clang/test/CodeGen/SystemZ/gnu-atomic-builtins-i128-8Al.c @@ -22,7 +22,7 @@ __int128 Des; // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load atomic i128, ptr @Ptr seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2:![0-9]+]] +// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6:![0-9]+]] // CHECK-NEXT: ret void // __int128 f1() { @@ -34,7 +34,7 @@ __int128 f1() { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load atomic i128, ptr @Ptr seq_cst, align 8 // CHECK-NEXT: store i128 [[TMP0]], ptr @Ret, align 8 -// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP0]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f2() { @@ -45,7 +45,7 @@ __int128 f2() { // CHECK-LABEL: define dso_local void @f3( // CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: store atomic i128 [[TMP0]], ptr @Ptr seq_cst, align 8 // CHECK-NEXT: ret void // @@ -67,9 +67,9 @@ void f4() { // CHECK-LABEL: define dso_local void @f5( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa 
[[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f5() { @@ -82,7 +82,7 @@ __int128 f5() { // CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8 // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: store i128 [[TMP1]], ptr @Ret, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f6() { @@ -93,7 +93,7 @@ __int128 f6() { // CHECK-LABEL: define dso_local noundef zeroext i1 @f7( // CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Des, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Des, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Exp, align 8 // CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP1]], i128 [[TMP0]] seq_cst seq_cst, align 8 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1 @@ -133,10 +133,10 @@ _Bool f8() { // CHECK-LABEL: define dso_local void @f9( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: [[TMP2:%.*]] = add i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f9() { @@ -146,10 +146,10 @@ __int128 f9() { // CHECK-LABEL: define dso_local void @f10( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: [[TMP2:%.*]] = sub i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f10() { @@ -159,10 +159,10 @@ __int128 f10() { // CHECK-LABEL: define dso_local void @f11( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f11() { @@ 
-172,10 +172,10 @@ __int128 f11() { // CHECK-LABEL: define dso_local void @f12( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: [[TMP2:%.*]] = xor i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f12() { @@ -185,10 +185,10 @@ __int128 f12() { // CHECK-LABEL: define dso_local void @f13( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: [[TMP2:%.*]] = or i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f13() { @@ -198,11 +198,11 @@ __int128 f13() { // CHECK-LABEL: define dso_local void @f14( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 // CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]] // CHECK-NEXT: [[TMP3:%.*]] = xor i128 [[TMP2]], -1 -// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f14() { @@ -212,9 +212,9 @@ __int128 f14() { // CHECK-LABEL: define dso_local void @f15( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f15() { @@ -224,9 +224,9 @@ __int128 f15() { // CHECK-LABEL: define dso_local void @f16( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// 
CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f16() { @@ -236,9 +236,9 @@ __int128 f16() { // CHECK-LABEL: define dso_local void @f17( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f17() { @@ -248,9 +248,9 @@ __int128 f17() { // CHECK-LABEL: define dso_local void @f18( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f18() { @@ -260,9 +260,9 @@ __int128 f18() { // CHECK-LABEL: define dso_local void @f19( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f19() { @@ -272,17 +272,17 @@ __int128 f19() { // CHECK-LABEL: define dso_local void @f20( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 8 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f20() { return 
__atomic_fetch_nand(&Ptr, Val, memory_order_seq_cst); } //. -// CHECK: [[__INT128_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"__int128", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[__INT128_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"__int128", [[META4]], i64 0} //. diff --git a/clang/test/CodeGen/SystemZ/sync-builtins-i128-16Al.c b/clang/test/CodeGen/SystemZ/sync-builtins-i128-16Al.c index 601bd7fa16153..ba77cdb29305a 100644 --- a/clang/test/CodeGen/SystemZ/sync-builtins-i128-16Al.c +++ b/clang/test/CodeGen/SystemZ/sync-builtins-i128-16Al.c @@ -13,9 +13,9 @@ __int128 OldVal __attribute__((aligned(16))); // CHECK-LABEL: define dso_local void @f1( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f1() { @@ -25,9 +25,9 @@ __int128 f1() { // CHECK-LABEL: define dso_local void @f2( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f2() { @@ -37,9 +37,9 @@ __int128 f2() { // CHECK-LABEL: define dso_local void @f3( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f3() { @@ -49,9 +49,9 @@ __int128 f3() { // CHECK-LABEL: define dso_local void @f4( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, 
!tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f4() { @@ -61,9 +61,9 @@ __int128 f4() { // CHECK-LABEL: define dso_local void @f5( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f5() { @@ -73,9 +73,9 @@ __int128 f5() { // CHECK-LABEL: define dso_local void @f6( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f6() { @@ -85,10 +85,10 @@ __int128 f6() { // CHECK-LABEL: define dso_local void @f7( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = add i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f7() { @@ -98,10 +98,10 @@ __int128 f7() { // CHECK-LABEL: define dso_local void @f8( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw sub ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = sub i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f8() { @@ -111,10 +111,10 @@ __int128 f8() { // CHECK-LABEL: define 
dso_local void @f9( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw or ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = or i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f9() { @@ -124,10 +124,10 @@ __int128 f9() { // CHECK-LABEL: define dso_local void @f10( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw and ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f10() { @@ -137,10 +137,10 @@ __int128 f10() { // CHECK-LABEL: define dso_local void @f11( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xor ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = xor i128 [[TMP1]], [[TMP0]] -// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f11() { @@ -150,11 +150,11 @@ __int128 f11() { // CHECK-LABEL: define dso_local void @f12( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw nand ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 // CHECK-NEXT: [[TMP2:%.*]] = and i128 [[TMP1]], [[TMP0]] // CHECK-NEXT: [[TMP3:%.*]] = xor i128 [[TMP2]], -1 -// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f12() { @@ -164,8 +164,8 @@ __int128 f12() { // CHECK-LABEL: define dso_local zeroext i1 @f13( // CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @OldVal, align 16, !tbaa [[__INT128_TBAA2]] -// 
CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @OldVal, align 16, !tbaa [[__INT128_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP0]], i128 [[TMP1]] seq_cst seq_cst, align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 1 // CHECK-NEXT: ret i1 [[TMP3]] @@ -177,11 +177,11 @@ _Bool f13() { // CHECK-LABEL: define dso_local void @f14( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @OldVal, align 16, !tbaa [[__INT128_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @OldVal, align 16, !tbaa [[__INT128_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr @Ptr, i128 [[TMP0]], i128 [[TMP1]] seq_cst seq_cst, align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i128, i1 } [[TMP2]], 0 -// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f14() { @@ -191,9 +191,9 @@ __int128 f14() { // CHECK-LABEL: define dso_local void @f15( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f15() { @@ -213,9 +213,9 @@ void f16() { // CHECK-LABEL: define dso_local void @f17( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(i128) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr @Val, align 16, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = atomicrmw xchg ptr @Ptr, i128 [[TMP0]] seq_cst, align 16 -// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: store i128 [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: ret void // __int128 f17() { @@ -227,8 +227,8 @@ __int128 f17() { // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[T_ADDR:%.*]] = alloca i128, align 8 -// CHECK-NEXT: [[T:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[__INT128_TBAA2]] -// CHECK-NEXT: store i128 [[T]], ptr [[T_ADDR]], align 8, !tbaa [[__INT128_TBAA2]] +// CHECK-NEXT: [[T:%.*]] = load i128, ptr [[TMP0]], align 8, !tbaa [[__INT128_TBAA6]] +// CHECK-NEXT: store i128 [[T]], ptr 
[[T_ADDR]], align 8, !tbaa [[__INT128_TBAA6]] // CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[T_ADDR]], i128 [[T]], i128 [[T]] seq_cst seq_cst, align 16 // CHECK-NEXT: ret void // @@ -236,8 +236,8 @@ void f18(__int128 t) { __sync_bool_compare_and_swap(({int x = 1; &t;}), t, t); } //. -// CHECK: [[__INT128_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"__int128", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[__INT128_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"__int128", [[META4]], i64 0} //. diff --git a/clang/test/CodeGen/SystemZ/zvector2.c b/clang/test/CodeGen/SystemZ/zvector2.c index f00fcdd52c401..b7994675fab50 100644 --- a/clang/test/CodeGen/SystemZ/zvector2.c +++ b/clang/test/CodeGen/SystemZ/zvector2.c @@ -8,8 +8,8 @@ volatile vector bool int bi; // CHECK-LABEL: define dso_local void @test_assign( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3:![0-9]+]] -// CHECK-NEXT: store volatile <4 x float> [[TMP0]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7:![0-9]+]] +// CHECK-NEXT: store volatile <4 x float> [[TMP0]], ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_assign (void) @@ -20,8 +20,8 @@ void test_assign (void) // CHECK-LABEL: define dso_local void @test_pos( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: store volatile <4 x float> [[TMP0]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: store volatile <4 x float> [[TMP0]], ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_pos (void) @@ -32,9 +32,9 @@ void test_pos (void) // CHECK-LABEL: define dso_local void @test_neg( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[TMP0]] -// CHECK-NEXT: store volatile <4 x float> [[FNEG]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[FNEG]], ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_neg (void) @@ -45,9 +45,9 @@ void test_neg (void) // CHECK-LABEL: define dso_local void @test_preinc( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[INC:%.*]] = fadd <4 x float> [[TMP0]], splat (float 1.000000e+00) -// CHECK-NEXT: store volatile <4 x float> [[INC]], ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[INC]], ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_preinc (void) @@ -58,9 +58,9 @@ void 
test_preinc (void) // CHECK-LABEL: define dso_local void @test_postinc( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[INC:%.*]] = fadd <4 x float> [[TMP0]], splat (float 1.000000e+00) -// CHECK-NEXT: store volatile <4 x float> [[INC]], ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[INC]], ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_postinc (void) @@ -71,9 +71,9 @@ void test_postinc (void) // CHECK-LABEL: define dso_local void @test_predec( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[DEC:%.*]] = fadd <4 x float> [[TMP0]], splat (float -1.000000e+00) -// CHECK-NEXT: store volatile <4 x float> [[DEC]], ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[DEC]], ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_predec (void) @@ -84,9 +84,9 @@ void test_predec (void) // CHECK-LABEL: define dso_local void @test_postdec( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[DEC:%.*]] = fadd <4 x float> [[TMP0]], splat (float -1.000000e+00) -// CHECK-NEXT: store volatile <4 x float> [[DEC]], ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[DEC]], ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_postdec (void) @@ -97,10 +97,10 @@ void test_postdec (void) // CHECK-LABEL: define dso_local void @test_add( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: store volatile <4 x float> [[ADD]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[ADD]], ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_add (void) @@ -111,10 +111,10 @@ void test_add (void) // CHECK-LABEL: define dso_local void @test_add_assign( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: store 
volatile <4 x float> [[ADD]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[ADD]], ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_add_assign (void) @@ -125,10 +125,10 @@ void test_add_assign (void) // CHECK-LABEL: define dso_local void @test_sub( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: store volatile <4 x float> [[SUB]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[SUB]], ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_sub (void) @@ -139,10 +139,10 @@ void test_sub (void) // CHECK-LABEL: define dso_local void @test_sub_assign( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[TMP1]], [[TMP0]] -// CHECK-NEXT: store volatile <4 x float> [[SUB]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[SUB]], ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_sub_assign (void) @@ -153,10 +153,10 @@ void test_sub_assign (void) // CHECK-LABEL: define dso_local void @test_mul( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: store volatile <4 x float> [[MUL]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[MUL]], ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_mul (void) @@ -167,10 +167,10 @@ void test_mul (void) // CHECK-LABEL: define dso_local void @test_mul_assign( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: store volatile <4 x float> [[MUL]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store 
volatile <4 x float> [[MUL]], ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_mul_assign (void) @@ -181,10 +181,10 @@ void test_mul_assign (void) // CHECK-LABEL: define dso_local void @test_div( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[DIV:%.*]] = fdiv <4 x float> [[TMP0]], [[TMP1]] -// CHECK-NEXT: store volatile <4 x float> [[DIV]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[DIV]], ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_div (void) @@ -195,10 +195,10 @@ void test_div (void) // CHECK-LABEL: define dso_local void @test_div_assign( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[DIV:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP0]] -// CHECK-NEXT: store volatile <4 x float> [[DIV]], ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x float> [[DIV]], ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_div_assign (void) @@ -209,11 +209,11 @@ void test_div_assign (void) // CHECK-LABEL: define dso_local void @test_cmpeq( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[CMP:%.*]] = fcmp oeq <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32> -// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_cmpeq (void) @@ -224,11 +224,11 @@ void test_cmpeq (void) // CHECK-LABEL: define dso_local void @test_cmpne( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[CMP:%.*]] = fcmp une <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32> -// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, 
!tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_cmpne (void) @@ -239,11 +239,11 @@ void test_cmpne (void) // CHECK-LABEL: define dso_local void @test_cmpge( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[CMP:%.*]] = fcmp oge <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32> -// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_cmpge (void) @@ -254,11 +254,11 @@ void test_cmpge (void) // CHECK-LABEL: define dso_local void @test_cmpgt( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[CMP:%.*]] = fcmp ogt <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32> -// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_cmpgt (void) @@ -269,11 +269,11 @@ void test_cmpgt (void) // CHECK-LABEL: define dso_local void @test_cmple( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[CMP:%.*]] = fcmp ole <4 x float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32> -// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_cmple (void) @@ -284,11 +284,11 @@ void test_cmple (void) // CHECK-LABEL: define dso_local void @test_cmplt( // CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA3]] -// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile <4 x float>, ptr @ff, align 8, !tbaa [[CHAR_TBAA7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile <4 x float>, ptr @ff2, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: [[CMP:%.*]] = fcmp olt <4 x 
float> [[TMP0]], [[TMP1]] // CHECK-NEXT: [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32> -// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA3]] +// CHECK-NEXT: store volatile <4 x i32> [[SEXT]], ptr @bi, align 8, !tbaa [[CHAR_TBAA7]] // CHECK-NEXT: ret void // void test_cmplt (void) @@ -297,7 +297,7 @@ void test_cmplt (void) } //. -// CHECK: [[CHAR_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} -// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} -// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META5:![0-9]+]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[CHAR_TBAA7]] = !{[[META5]], [[META5]], i64 0} //. diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 8f3d4590d0b3e..26aed77283391 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1070,19 +1070,25 @@ __m128d test_mm256_extractf128_pd(__m256d A) { // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> return _mm256_extractf128_pd(A, 1); } +TEST_CONSTEXPR(match_m128d(_mm256_extractf128_pd(((__m256d){0.0, 1.0, 2.0, 3.0}), 1), + 2.0, 3.0)); __m128 test_mm256_extractf128_ps(__m256 A) { // CHECK-LABEL: test_mm256_extractf128_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> return _mm256_extractf128_ps(A, 1); } +TEST_CONSTEXPR(match_m128(_mm256_extractf128_ps(((__m256){0,1,2,3,4,5,6,7}), 1), + 4.0f, 5.0f, 6.0f, 7.0f)); __m128i test_mm256_extractf128_si256(__m256i A) { // CHECK-LABEL: test_mm256_extractf128_si256 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <4 x i32> return _mm256_extractf128_si256(A, 1); } - +TEST_CONSTEXPR(match_m128i(_mm256_extractf128_si256(((__m256i){0ULL, 1ULL, 2ULL, 3ULL}), 1), + 2ULL, 3ULL)); + __m256d test_mm256_floor_pd(__m256d x) { // CHECK-LABEL: test_mm256_floor_pd // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 1) @@ -1100,6 +1106,7 @@ __m256d test_mm256_hadd_pd(__m256d A, __m256d B) { // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_hadd_pd(A, B); } +TEST_CONSTEXPR(match_m256d(_mm256_hadd_pd((__m256d){+1.0, +2.0, +3.0, +4.0}, (__m256d){+5.0, +6.0, +7.0, +8.0}), +3.0, +11.0, +7.0, +15.0)); __m256 test_mm256_hadd_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_hadd_ps @@ -1107,17 +1114,27 @@ __m256 test_mm256_hadd_ps(__m256 A, __m256 B) { return _mm256_hadd_ps(A, B); } +TEST_CONSTEXPR(match_m256(_mm256_hadd_ps( + (__m256){+1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f, +8.0f}, + (__m256){+9.0f, +10.0f, +11.0f, +12.0f, +13.0f, +14.0f, +15.0f, +16.0f}), + +3.0f, +7.0f, +19.0f, +23.0f, +11.0f, +15.0f, +27.0f, +31.0f)); + __m256d test_mm256_hsub_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_hsub_pd // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}) return _mm256_hsub_pd(A, B); } +TEST_CONSTEXPR(match_m256d(_mm256_hsub_pd((__m256d){+1.0, +2.0, +4.0, +3.0}, (__m256d){+10.0, +6.0, +16.0, +8.0}), -1.0,+4.0,+1.0,+8.0)); __m256 test_mm256_hsub_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_hsub_ps // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}) return _mm256_hsub_ps(A, B); } +TEST_CONSTEXPR(match_m256(_mm256_hsub_ps( + (__m256){1.0f, 2.0f, 4.0f, 3.0f, 5.0f, 7.0f, 7.0f, 5.0f}, + (__m256){6.0f, 9.0f, 11.0f, 8.0f, 
13.0f, 17.0f, 15.0f, 11.0f}), + -1.0f, 1.0f, -3.0f, 3.0f, -2.0f, 2.0f, -4.0f, 4.0f)); __m256i test_mm256_insert_epi8(__m256i x, char b) { // CHECK-LABEL: test_mm256_insert_epi8 @@ -1337,12 +1354,16 @@ int test_mm256_movemask_pd(__m256d A) { // CHECK: call {{.*}}i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %{{.*}}) return _mm256_movemask_pd(A); } +TEST_CONSTEXPR(_mm256_movemask_pd((__m256d)(__v4df){-1234.5678901234, 98765.4321098765, 0.000123456789, -3.14159265358979}) == 0x9); +TEST_CONSTEXPR(_mm256_movemask_pd((__m256d)(__v4df){-0.000000987654321, -99999.999999999, 42.424242424242, 314159.2653589793}) == 0x3); int test_mm256_movemask_ps(__m256 A) { // CHECK-LABEL: test_mm256_movemask_ps // CHECK: call {{.*}}i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %{{.*}}) return _mm256_movemask_ps(A); } +TEST_CONSTEXPR(_mm256_movemask_ps((__m256)(__v8sf){-12.3456f, 34.7890f, -0.0001234f, 123456.78f, -987.654f, 0.001234f, 3.14159f, -256.001f}) == 0x95); +TEST_CONSTEXPR(_mm256_movemask_ps((__m256)(__v8sf){0.333333f, -45.6789f, 999.999f, -0.9999f, 17.234f, -128.512f, 2048.0f, -3.14f}) == 0xAA); __m256d test_mm256_mul_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_mul_pd diff --git a/clang/test/CodeGen/X86/avx10_2bf16-builtins.c b/clang/test/CodeGen/X86/avx10_2bf16-builtins.c index c7fea07421b56..f8a4c51d9ceb3 100644 --- a/clang/test/CodeGen/X86/avx10_2bf16-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2bf16-builtins.c @@ -274,7 +274,7 @@ __m256bh test_mm256_loadu_pbh(void *p) { __m128bh test_mm_load_sbh(void const *A) { // CHECK-LABEL: test_mm_load_sbh - // CHECK: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> bitcast (<1 x i8> splat (i8 1) to <8 x i1>), <8 x bfloat> %{{.*}}) + // CHECK: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr align 1 %{{.*}}, <8 x i1> bitcast (<1 x i8> splat (i8 1) to <8 x i1>), <8 x bfloat> %{{.*}}) return _mm_load_sbh(A); } @@ -305,7 +305,7 @@ void test_mm_store_sbh(void *A, __m128bh B) { void test_mm_mask_store_sbh(void *__P, __mmask8 __U, __m128bh __A) { // CHECK-LABEL: @test_mm_mask_store_sbh - // CHECK: call void @llvm.masked.store.v8bf16.p0(<8 x bfloat> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}) + // CHECK: call void @llvm.masked.store.v8bf16.p0(<8 x bfloat> %{{.*}}, ptr align 1 %{{.*}}, <8 x i1> %{{.*}}) _mm_mask_store_sbh(__P, __U, __A); } @@ -323,13 +323,13 @@ void test_mm_store_pbh(void *p, __m128bh a) { __m128bh test_mm_mask_load_sbh(__m128bh __A, __mmask8 __U, const void *__W) { // CHECK-LABEL: @test_mm_mask_load_sbh - // CHECK: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}) return _mm_mask_load_sbh(__A, __U, __W); } __m128bh test_mm_maskz_load_sbh(__mmask8 __U, const void *__W) { // CHECK-LABEL: @test_mm_maskz_load_sbh - // CHECK: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}) + // CHECK: %{{.*}} = call <8 x bfloat> @llvm.masked.load.v8bf16.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}) return _mm_maskz_load_sbh(__U, __W); } diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index 55f18f947b96f..03b1bdeb55d7c 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -466,6 +466,7 @@ __m128i test0_mm256_extracti128_si256_0(__m256i a) { // CHECK: 
shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> return _mm256_extracti128_si256(a, 0); } +TEST_CONSTEXPR(match_m128i(_mm256_extracti128_si256(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), 0),1ULL, 2ULL)); __m128i test1_mm256_extracti128_si256_1(__m256i a) { // CHECK-LABEL: test1_mm256_extracti128_si256 @@ -485,36 +486,60 @@ __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) { // CHECK: call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_hadd_epi16(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_hadd_epi16( + (__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, + (__m256i)(__v16hi){17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}), + 3,7,11,15,35,39,43,47,19,23,27,31,51,55,59,63)); __m256i test_mm256_hadd_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hadd_epi32 // CHECK: call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_hadd_epi32(a, b); } +TEST_CONSTEXPR(match_v8si(_mm256_hadd_epi32( + (__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80}, + (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75}), + 30,70,20,60,110,150,100,140)); __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hadds_epi16 // CHECK:call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_hadds_epi16(a, b); } +TEST_CONSTEXPR(match_v16hi( _mm256_hadds_epi16( + (__m256i)(__v16hi){32767, 32767, 1,2,3,4,5,6,7,8,9,10,11,12,13,14}, + (__m256i)(__v16hi){19,20,21,22,23,24,25,26,27,28,29,30,31,32, 32767, 5}), + 32767, 3,7,11, 39,43,47,51,15,19,23,27, 55,59,63, 32767)); __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hsub_epi16 // CHECK: call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_hsub_epi16(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_hsub_epi16( + (__m256i)(__v16hi){2,1,1,2,5,3,3,5,7,4,4,7,9,5,5,9}, + (__m256i)(__v16hi){10,5,5,10,12,6,6,12,21,14,14,21,24,16,16,24}), + 1,-1,2,-2,5,-5,6,-6,3,-3,4,-4, 7,-7,8,-8)); __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hsub_epi32 // CHECK: call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_hsub_epi32(a, b); } +TEST_CONSTEXPR(match_v8si(_mm256_hsub_epi32( + (__m256i)(__v8si){10, 20, 30,50,60,90,100,140}, + (__m256i)(__v8si){200,150,260,200,420,350,800,720}), + -10,-20,50,60, -30,-40, 70,80)); __m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hsubs_epi16 // CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_hsubs_epi16(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_hsubs_epi16( + (__m256i)(__v16hi){32726, -100, 3, 2, 6, 4, 8, 5,15,10 ,21, 14, 27, 18, 100, 90}, + (__m256i)(__v16hi){40, 20, 100, 70, 200,150, 100,40, 1000,900,300,150, 500,300, 1, 1}), + 32767, 1, 2, 3, 20, 30, 50, 60, 5, 7, 9, 10, 100, 150, 200, 0)); __m128i test_mm_i32gather_epi32(int const *b, __m128i c) { // CHECK-LABEL: test_mm_i32gather_epi32 @@ -968,6 +993,9 @@ int test_mm256_movemask_epi8(__m256i a) { // CHECK: call {{.*}}i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %{{.*}}) return _mm256_movemask_epi8(a); } +TEST_CONSTEXPR(_mm256_movemask_epi8((__m256i)(__v32qu){0x7F,0x80,0x01,0xFF,0x00,0xAA,0x55,0xC3,0x12,0x8E,0x00,0xFE,0x7E,0x81,0xFF,0x01,0xB6,0x00,0x39,0x40,0xD0,0x05,0x80,0x2A,0x7B,0x00,0x90,0xFF,0x01,0x34,0xC0,0x6D}) == 0x4C516AAA); 
+TEST_CONSTEXPR(_mm256_movemask_epi8((__m256i)(__v8si){(int)0x80FF00AA,(int)0x7F0183E1,(int)0xDEADBEEF,(int)0xC0000001,(int)0x00000000,(int)0xFFFFFFFF,(int)0x12345678,(int)0x90ABCDEF}) == 0xF0F08F3D); +TEST_CONSTEXPR(_mm256_movemask_epi8((__m256i)(__v4du){0xFF00000000000080ULL,0x7F010203040506C3ULL,0x8000000000000000ULL,0x0123456789ABCDEFULL}) == 0x0F800181); __m256i test_mm256_mpsadbw_epu8(__m256i x, __m256i y) { // CHECK-LABEL: test_mm256_mpsadbw_epu8 @@ -1014,6 +1042,7 @@ __m256i test_mm256_mulhrs_epi16(__m256i a, __m256i b) { // CHECK: call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_mulhrs_epi16(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_mulhrs_epi16((__m256i)(__v16hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600}, (__m256i)(__v16hi){+1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +5, -9, -13, +16, +18, -20, -21, -22, -22, +21, +20, -18, -16, +13, +9, -5)); __m256i test_mm256_mullo_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_mullo_epi16 @@ -1106,6 +1135,8 @@ __m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) { return _mm256_shuffle_epi8(a, b); } +TEST_CONSTEXPR(match_v32qi(_mm256_shuffle_epi8((__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qs){0,33,2,35,4,37,6,-39,8,41,10,43,12,45,14,-47,16,49,18,51,20,53,22,-55,24,57,26,59,28,61,30,-63}), 0,1,2,3,4,5,6,0,8,9,10,11,12,13,14,0,16,17,18,19,20,21,22,0,24,25,26,27,28,29,30,0)); + __m256i test_mm256_shuffle_epi32(__m256i a) { // CHECK-LABEL: test_mm256_shuffle_epi32 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <8 x i32> diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index af1c904a6de48..d07e40a2b7071 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -1466,18 +1466,27 @@ __m512i test_mm512_shuffle_epi8(__m512i __A, __m512i __B) { // CHECK: @llvm.x86.avx512.pshuf.b.512 return _mm512_shuffle_epi8(__A,__B); } + +TEST_CONSTEXPR(match_v64qi(_mm512_shuffle_epi8((__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,-15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,-15,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,-79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,-95}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,0,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,0,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,0)); + __m512i test_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_shuffle_epi8 // CHECK: @llvm.x86.avx512.pshuf.b.512 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_shuffle_epi8(__W,__U,__A,__B); } + +TEST_CONSTEXPR(match_v64qi(_mm512_mask_shuffle_epi8((__m512i)(__v64qi){1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8}, 0xFFFFFFFF00000000, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, 
(__m512i)(__v64qi){63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48)); + __m512i test_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_shuffle_epi8 // CHECK: @llvm.x86.avx512.pshuf.b.512 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_shuffle_epi8(__U,__A,__B); } + +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_shuffle_epi8(0x8888888888888888,(__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){127,126,125,124,123,122,121,120,119,118,117,116,115,114,113,112,111,110,109,108,107,106,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64}), 0,0,0,12,0,0,0,8,0,0,0,4,0,0,0,0,0,0,0,28,0,0,0,24,0,0,0,20,0,0,0,16,0,0,0,44,0,0,0,40,0,0,0,36,0,0,0,32,0,0,0,60,0,0,0,56,0,0,0,52,0,0,0,48)); + __m512i test_mm512_subs_epi8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_subs_epi8 // CHECK: @llvm.ssub.sat.v64i8 @@ -1587,18 +1596,24 @@ __m512i test_mm512_mulhrs_epi16(__m512i __A, __m512i __B) { // CHECK: @llvm.x86.avx512.pmul.hr.sw.512 return _mm512_mulhrs_epi16(__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mulhrs_epi16((__m512i)(__v32hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600, -1700, -1800, +1900, +2000, -2100, -2200, +2300, +2400, -2500, -2600, +2700, +2800, -2900, -3000, +3100, +3200}, (__m512i)(__v32hi){+3200, -3100, +3000, -2900, +2800, -2700, +2600, -2500, +2400, -2300, +2200, -2100, +2000, -1900, +1800, -1700, +1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +10, -19, -27, +35, +43, -49, -56, -61, -66, +70, +74, -77, -79, +81, +82, -83, -83, +82, +81, -79, -77, +74, +70, -66, -61, +56, +49, -43, -35, +27, +19, -10)); + __m512i test_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_mulhrs_epi16 // CHECK: @llvm.x86.avx512.pmul.hr.sw.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_mulhrs_epi16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_mulhrs_epi16(_mm512_set1_epi16(1), 0x0000FFFF, (__m512i)(__v32hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600, -1700, -1800, +1900, +2000, -2100, -2200, +2300, +2400, -2500, -2600, +2700, +2800, -2900, -3000, +3100, +3200}, (__m512i)(__v32hi){+3200, -3100, +3000, -2900, +2800, -2700, +2600, -2500, +2400, -2300, +2200, -2100, +2000, -1900, +1800, -1700, +1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +10, -19, -27, +35, +43, -49, -56, -61, -66, +70, +74, -77, -79, +81, +82, -83, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1, +1)); + __m512i test_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_mulhrs_epi16 // CHECK: @llvm.x86.avx512.pmul.hr.sw.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} 
return _mm512_maskz_mulhrs_epi16(__U,__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_mulhrs_epi16(0x0000FFFF, (__m512i)(__v32hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600, -1700, -1800, +1900, +2000, -2100, -2200, +2300, +2400, -2500, -2600, +2700, +2800, -2900, -3000, +3100, +3200}, (__m512i)(__v32hi){+3200, -3100, +3000, -2900, +2800, -2700, +2600, -2500, +2400, -2300, +2200, -2100, +2000, -1900, +1800, -1700, +1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +10, -19, -27, +35, +43, -49, -56, -61, -66, +70, +74, -77, -79, +81, +82, -83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); + __m512i test_mm512_mulhi_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mulhi_epi16 // CHECK: @llvm.x86.avx512.pmulh.w.512 @@ -2399,13 +2414,13 @@ __m512i test_mm512_loadu_epi16 (void *__P) __m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) { // CHECK-LABEL: test_mm512_mask_loadu_epi16 - // CHECK: @llvm.masked.load.v32i16.p0(ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i16> %{{.*}}) + // CHECK: @llvm.masked.load.v32i16.p0(ptr align 1 %{{.*}}, <32 x i1> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_mask_loadu_epi16(__W, __U, __P); } __m512i test_mm512_maskz_loadu_epi16(__mmask32 __U, void const *__P) { // CHECK-LABEL: test_mm512_maskz_loadu_epi16 - // CHECK: @llvm.masked.load.v32i16.p0(ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i16> %{{.*}}) + // CHECK: @llvm.masked.load.v32i16.p0(ptr align 1 %{{.*}}, <32 x i1> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_maskz_loadu_epi16(__U, __P); } @@ -2418,13 +2433,13 @@ __m512i test_mm512_loadu_epi8 (void *__P) __m512i test_mm512_mask_loadu_epi8(__m512i __W, __mmask64 __U, void const *__P) { // CHECK-LABEL: test_mm512_mask_loadu_epi8 - // CHECK: @llvm.masked.load.v64i8.p0(ptr %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}}) + // CHECK: @llvm.masked.load.v64i8.p0(ptr align 1 %{{.*}}, <64 x i1> %{{.*}}, <64 x i8> %{{.*}}) return _mm512_mask_loadu_epi8(__W, __U, __P); } __m512i test_mm512_maskz_loadu_epi8(__mmask64 __U, void const *__P) { // CHECK-LABEL: test_mm512_maskz_loadu_epi8 - // CHECK: @llvm.masked.load.v64i8.p0(ptr %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}}) + // CHECK: @llvm.masked.load.v64i8.p0(ptr align 1 %{{.*}}, <64 x i1> %{{.*}}, <64 x i8> %{{.*}}) return _mm512_maskz_loadu_epi8(__U, __P); } @@ -2436,7 +2451,7 @@ void test_mm512_storeu_epi16(void *__P, __m512i __A) { void test_mm512_mask_storeu_epi16(void *__P, __mmask32 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_storeu_epi16 - // CHECK: @llvm.masked.store.v32i16.p0(<32 x i16> %{{.*}}, ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v32i16.p0(<32 x i16> %{{.*}}, ptr align 1 %{{.*}}, <32 x i1> %{{.*}}) return _mm512_mask_storeu_epi16(__P, __U, __A); } @@ -2455,7 +2470,7 @@ void test_mm512_storeu_epi8(void *__P, __m512i __A) { void test_mm512_mask_storeu_epi8(void *__P, __mmask64 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_storeu_epi8 - // CHECK: @llvm.masked.store.v64i8.p0(<64 x i8> %{{.*}}, ptr %{{.*}}, i32 1, <64 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v64i8.p0(<64 x i8> %{{.*}}, ptr align 1 %{{.*}}, <64 x i1> %{{.*}}) return _mm512_mask_storeu_epi8(__P, __U, __A); } __mmask64 test_mm512_mask_test_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) { diff --git a/clang/test/CodeGen/X86/avx512cd-builtins.c b/clang/test/CodeGen/X86/avx512cd-builtins.c index b9d42b7dea237..80a20b1244532 
100644 --- a/clang/test/CodeGen/X86/avx512cd-builtins.c +++ b/clang/test/CodeGen/X86/avx512cd-builtins.c @@ -14,37 +14,53 @@ __m512i test_mm512_conflict_epi64(__m512i __A) { // CHECK-LABEL: test_mm512_conflict_epi64 // CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %{{.*}}) - return _mm512_conflict_epi64(__A); + return _mm512_conflict_epi64(__A); } + +TEST_CONSTEXPR(match_v8di(_mm512_conflict_epi64((__m512i)(__v8di){1, 2, 1, 3, 2, 4, 1, 5}), 0, 0, 1, 0, 2, 0, 5, 0)); +TEST_CONSTEXPR(match_v8di(_mm512_conflict_epi64((__m512i)(__v8di){5, 5, 5, 5, 5, 5, 5, 5}), 0, 1, 3, 7, 15, 31, 63, 127)); +TEST_CONSTEXPR(match_v8di(_mm512_conflict_epi64((__m512i)(__v8di){1, 2, 3, 4, 5, 6, 7, 8}), 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_mask_conflict_epi64(__m512i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_conflict_epi64 // CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} - return _mm512_mask_conflict_epi64(__W,__U,__A); + return _mm512_mask_conflict_epi64(__W,__U,__A); } + +TEST_CONSTEXPR(match_v8di(_mm512_mask_conflict_epi64((__m512i)(__v8di){0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}, 0x55, (__m512i)(__v8di){1, 2, 1, 3, 2, 4, 1, 5}), 0, 0xFF, 1, 0xFF, 2, 0xFF, 5, 0xFF)); __m512i test_mm512_maskz_conflict_epi64(__mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_conflict_epi64 // CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} - return _mm512_maskz_conflict_epi64(__U,__A); + return _mm512_maskz_conflict_epi64(__U,__A); } + +TEST_CONSTEXPR(match_v8di(_mm512_maskz_conflict_epi64(0x55, (__m512i)(__v8di){1, 2, 1, 3, 2, 4, 1, 5}), 0, 0, 1, 0, 2, 0, 5, 0)); __m512i test_mm512_conflict_epi32(__m512i __A) { // CHECK-LABEL: test_mm512_conflict_epi32 // CHECK: call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %{{.*}}) - return _mm512_conflict_epi32(__A); + return _mm512_conflict_epi32(__A); } + +TEST_CONSTEXPR(match_v16si(_mm512_conflict_epi32((__m512i)(__v16si){1, 2, 1, 3, 2, 4, 1, 5, 6, 7, 6, 8, 7, 9, 6, 10}), 0, 0, 1, 0, 2, 0, 5, 0, 0, 0, 256, 0, 512, 0, 1280, 0)); +TEST_CONSTEXPR(match_v16si(_mm512_conflict_epi32((__m512i)(__v16si){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}), 0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767)); +TEST_CONSTEXPR(match_v16si(_mm512_conflict_epi32((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_mask_conflict_epi32(__m512i __W, __mmask16 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_conflict_epi32 // CHECK: call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} - return _mm512_mask_conflict_epi32(__W,__U,__A); + return _mm512_mask_conflict_epi32(__W,__U,__A); } + +TEST_CONSTEXPR(match_v16si(_mm512_mask_conflict_epi32((__m512i)(__v16si){0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}, 0x5555, (__m512i)(__v16si){1, 2, 1, 3, 2, 4, 1, 5, 6, 7, 6, 8, 7, 9, 6, 10}), 0, 0xFF, 1, 0xFF, 2, 0xFF, 5, 0xFF, 0, 0xFF, 256, 0xFF, 512, 0xFF, 1280, 0xFF)); __m512i test_mm512_maskz_conflict_epi32(__mmask16 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_conflict_epi32 // CHECK: call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %{{.*}}) // CHECK: 
select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} - return _mm512_maskz_conflict_epi32(__U,__A); + return _mm512_maskz_conflict_epi32(__U,__A); } + +TEST_CONSTEXPR(match_v16si(_mm512_maskz_conflict_epi32(0x5555, (__m512i)(__v16si){1, 2, 1, 3, 2, 4, 1, 5, 6, 7, 6, 8, 7, 9, 6, 10}), 0, 0, 1, 0, 2, 0, 5, 0, 0, 0, 256, 0, 512, 0, 1280, 0)); __m512i test_mm512_lzcnt_epi32(__m512i __A) { // CHECK-LABEL: test_mm512_lzcnt_epi32 // CHECK: call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %{{.*}}, i1 true) @@ -125,6 +141,8 @@ __m512i test_mm512_broadcastmb_epi64(__m512i a, __m512i b) { // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 7 return _mm512_broadcastmb_epi64(_mm512_cmpeq_epu64_mask ( a, b)); } +TEST_CONSTEXPR(match_v8di(_mm512_broadcastmb_epi64((__mmask8)(0)), 0,0,0,0, 0,0,0,0)); +TEST_CONSTEXPR(match_v8di(_mm512_broadcastmb_epi64((__mmask8)(0xab)), 0xab,0xab,0xab,0xab, 0xab,0xab,0xab,0xab)); __m512i test_mm512_broadcastmw_epi32(__m512i a, __m512i b) { // CHECK-LABEL: test_mm512_broadcastmw_epi32 @@ -148,3 +166,5 @@ __m512i test_mm512_broadcastmw_epi32(__m512i a, __m512i b) { // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}} return _mm512_broadcastmw_epi32(_mm512_cmpeq_epi32_mask ( a, b)); } +TEST_CONSTEXPR(match_v16si(_mm512_broadcastmw_epi32((__mmask16)(0xff)), 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff)); +TEST_CONSTEXPR(match_v16si(_mm512_broadcastmw_epi32((__mmask16)(0x0FA1L)), 0x0FA1L,0x0FA1L,0x0FA1L,0x0FA1L, 0x0FA1L,0x0FA1L,0x0FA1L,0x0FA1L, 0x0FA1L,0x0FA1L,0x0FA1L,0x0FA1L, 0x0FA1L,0x0FA1L,0x0FA1L,0x0FA1L)); diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index 4112561216af8..9c4ada3a2b7b8 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -1305,6 +1305,7 @@ __m512 test_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, float const* _ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_broadcast_f32x8(__O, __M, _mm256_loadu_ps(__A)); } +TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x8(_mm512_setzero_ps(), 0xAAAA, (__m256)(__v8sf){5.0f,5.0f,5.0f,5.0f,5.0f,5.0f,5.0f,5.0f}), 0,5,0,5,0,5,0,5,0,5,0,5,0,5,0,5)); __m512 test_mm512_maskz_broadcast_f32x8(__mmask16 __M, float const* __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_f32x8 @@ -1312,6 +1313,7 @@ __m512 test_mm512_maskz_broadcast_f32x8(__mmask16 __M, float const* __A) { // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_broadcast_f32x8(__M, _mm256_loadu_ps(__A)); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x8(0xAAAA, _mm256_set1_ps(7.0f)), 0,7,0,7,0,7,0,7,0,7,0,7,0,7,0,7)); __m512d test_mm512_broadcast_f64x2(double const* __A) { // CHECK-LABEL: test_mm512_broadcast_f64x2 @@ -1327,6 +1329,8 @@ __m512d test_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, double const* return _mm512_mask_broadcast_f64x2(__O, __M, _mm_loadu_pd(__A)); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_broadcast_f64x2(_mm512_setzero_pd(), 0xAA, (__m128d)(__v2df){1,2}), 0,2,0,2,0,2,0,2)); + __m512d test_mm512_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_f64x2 // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> @@ -1334,6 +1338,8 @@ __m512d test_mm512_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) { return _mm512_maskz_broadcast_f64x2(__M, _mm_loadu_pd(__A)); } 
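+// Note on the masked broadcast checks below: the 128-bit source {1,2} is
+// repeated across all four 128-bit lanes, then the 8-bit mask selects per
+// double, zeroing unselected elements. Mask 0xAA (binary 10101010) keeps
+// only the odd elements, hence the alternating 0,2 pattern expected below.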
+TEST_CONSTEXPR(match_m512d(_mm512_maskz_broadcast_f64x2(0xAA, (__m128d)(__v2df){1,2}), 0,2,0,2,0,2,0,2)); + __m512i test_mm512_broadcast_i32x2(__m128i __A) { // CHECK-LABEL: test_mm512_broadcast_i32x2 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> @@ -1348,6 +1354,8 @@ __m512i test_mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) return _mm512_mask_broadcast_i32x2(__O, __M, __A); } +TEST_CONSTEXPR(match_v16si(_mm512_mask_broadcast_i32x2(_mm512_setzero_si512(), 0xAAAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1)); + __m512i test_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_i32x2 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> @@ -1355,6 +1363,8 @@ __m512i test_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) { return _mm512_maskz_broadcast_i32x2(__M, __A); } +TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x2(0xAAAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1)); + __m512i test_mm512_broadcast_i32x8(__m256i const* __A) { // CHECK-LABEL: test_mm512_broadcast_i32x8 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <16 x i32> @@ -1368,6 +1378,7 @@ __m512i test_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i cons // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_broadcast_i32x8(__O, __M, _mm256_loadu_si256(__A)); } +TEST_CONSTEXPR(match_v16si(_mm512_mask_broadcast_i32x8(_mm512_setzero_si512(), 0xAAAA, _mm256_set1_epi32(8)), 0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8)); __m512i test_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i const* __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_i32x8 @@ -1376,6 +1387,8 @@ __m512i test_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i const* __A) { return _mm512_maskz_broadcast_i32x8(__M, _mm256_loadu_si256(__A)); } +TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x8(0xAAAA, _mm256_set1_epi32(9)), 0,9,0,9,0,9,0,9,0,9,0,9,0,9,0,9)); + __m512i test_mm512_broadcast_i64x2(__m128i const* __A) { // CHECK-LABEL: test_mm512_broadcast_i64x2 // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> @@ -1402,6 +1415,7 @@ __m256 test_mm512_extractf32x8_ps(__m512 __A) { // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <8 x i32> return _mm512_extractf32x8_ps(__A, 1); } +TEST_CONSTEXPR(match_m256(_mm512_extractf32x8_ps(((__m512){0.0f,1.0f,2.0f,3.0f, 4.0f,5.0f,6.0f,7.0f,8.0f,9.0f,10.0f,11.0f, 12.0f,13.0f,14.0f,15.0f}), 1),8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)); __m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) { // CHECK-LABEL: test_mm512_mask_extractf32x8_ps @@ -1409,6 +1423,7 @@ __m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm512_mask_extractf32x8_ps(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_m256(_mm512_mask_extractf32x8_ps(((__m256)(__v8sf){0,0,0,0,0,0,0,0}), (__mmask8)0xFF,((__m512)(__v16sf){0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f}),1),8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)); __m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) { // CHECK-LABEL: test_mm512_maskz_extractf32x8_ps @@ -1416,12 +1431,14 @@ __m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return 
_mm512_maskz_extractf32x8_ps(__U, __A, 1); } +TEST_CONSTEXPR(match_m256(_mm512_maskz_extractf32x8_ps((__mmask8)0x0F, ((__m512)(__v16sf){0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f,8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f}),1),8.0f, 9.0f, 10.0f, 11.0f, 0.0f, 0.0f, 0.0f, 0.0f)); __m128d test_mm512_extractf64x2_pd(__m512d __A) { // CHECK-LABEL: test_mm512_extractf64x2_pd // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <2 x i32> return _mm512_extractf64x2_pd(__A, 3); } +TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd(((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3),6.0, 7.0)); __m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) { // CHECK-LABEL: test_mm512_mask_extractf64x2_pd @@ -1429,6 +1446,7 @@ __m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm512_mask_extractf64x2_pd(__W, __U, __A, 3); } +TEST_CONSTEXPR(match_m128d(_mm512_mask_extractf64x2_pd(((__m128d)(__v2df){100.0, 101.0}),(__mmask8)0x1,((__m512d)(__v8df){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}),3),6.0, 101.0)); __m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) { // CHECK-LABEL: test_mm512_maskz_extractf64x2_pd @@ -1436,12 +1454,14 @@ __m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) { // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm512_maskz_extractf64x2_pd(__U, __A, 3); } +TEST_CONSTEXPR(match_m128d(_mm512_maskz_extractf64x2_pd((__mmask8)0x2,((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}),3),0.0, 7.0)); __m256i test_mm512_extracti32x8_epi32(__m512i __A) { // CHECK-LABEL: test_mm512_extracti32x8_epi32 // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <8 x i32> return _mm512_extracti32x8_epi32(__A, 1); } +TEST_CONSTEXPR(match_v8si(_mm512_extracti32x8_epi32(((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1),8, 9,10,11,12,13,14,15)); __m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_extracti32x8_epi32 @@ -1449,6 +1469,7 @@ __m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm512_mask_extracti32x8_epi32(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_v8si(_mm512_mask_extracti32x8_epi32(((__m256i)(__v8si){100,101,102,103,104,105,106,107}), (__mmask8)0xAA,((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),1),100, 9, 102, 11, 104, 13, 106, 15)); __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_extracti32x8_epi32 @@ -1456,12 +1477,14 @@ __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm512_maskz_extracti32x8_epi32(__U, __A, 1); } +TEST_CONSTEXPR(match_v8si(_mm512_maskz_extracti32x8_epi32((__mmask8)0x0F,((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),1),8, 9, 10, 11, 0, 0, 0, 0)); __m128i test_mm512_extracti64x2_epi64(__m512i __A) { // CHECK-LABEL: test_mm512_extracti64x2_epi64 // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <2 x i32> return _mm512_extracti64x2_epi64(__A, 3); } +TEST_CONSTEXPR(match_m128i(_mm512_extracti64x2_epi64(((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3),6ULL, 7ULL)); __m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: 
test_mm512_mask_extracti64x2_epi64 @@ -1469,6 +1492,7 @@ __m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm512_mask_extracti64x2_epi64(__W, __U, __A, 3); } +TEST_CONSTEXPR(match_m128i(_mm512_mask_extracti64x2_epi64(((__m128i)(__v2di){100ULL, 101ULL}), (__mmask8)0x1,((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}),3),6ULL, 101ULL)); __m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_extracti64x2_epi64 @@ -1476,6 +1500,7 @@ __m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) { // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm512_maskz_extracti64x2_epi64(__U, __A, 3); } +TEST_CONSTEXPR(match_m128i(_mm512_maskz_extracti64x2_epi64((__mmask8)0x2,((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}),3),0ULL, 7ULL)); __m512 test_mm512_insertf32x8(__m512 __A, __m256 __B) { // CHECK-LABEL: test_mm512_insertf32x8 diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 7756f0da18c03..122dadd03a211 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -212,7 +212,7 @@ void test_mm512_storeu_pd(void *p, __m512d a) void test_mm512_mask_store_ps(void *p, __m512 a, __mmask16 m) { // CHECK-LABEL: test_mm512_mask_store_ps - // CHECK: @llvm.masked.store.v16f32.p0(<16 x float> %{{.*}}, ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v16f32.p0(<16 x float> %{{.*}}, ptr align 64 %{{.*}}, <16 x i1> %{{.*}}) _mm512_mask_store_ps(p, m, a); } @@ -260,7 +260,7 @@ void test_mm512_store_pd(void *p, __m512d a) void test_mm512_mask_store_pd(void *p, __m512d a, __mmask8 m) { // CHECK-LABEL: test_mm512_mask_store_pd - // CHECK: @llvm.masked.store.v8f64.p0(<8 x double> %{{.*}}, ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v8f64.p0(<8 x double> %{{.*}}, ptr align 64 %{{.*}}, <8 x i1> %{{.*}}) _mm512_mask_store_pd(p, m, a); } @@ -272,7 +272,7 @@ void test_mm512_storeu_epi32(void *__P, __m512i __A) { void test_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_storeu_epi32 - // CHECK: @llvm.masked.store.v16i32.p0(<16 x i32> %{{.*}}, ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v16i32.p0(<16 x i32> %{{.*}}, ptr align 1 %{{.*}}, <16 x i1> %{{.*}}) return _mm512_mask_storeu_epi32(__P, __U, __A); } @@ -284,7 +284,7 @@ void test_mm512_storeu_epi64(void *__P, __m512i __A) { void test_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_storeu_epi64 - // CHECK: @llvm.masked.store.v8i64.p0(<8 x i64> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v8i64.p0(<8 x i64> %{{.*}}, ptr align 1 %{{.*}}, <8 x i1> %{{.*}}) return _mm512_mask_storeu_epi64(__P, __U, __A); } @@ -305,14 +305,14 @@ __m512i test_mm512_loadu_epi32 (void *__P) __m512i test_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void *__P) { // CHECK-LABEL: test_mm512_mask_loadu_epi32 - // CHECK: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: @llvm.masked.load.v16i32.p0(ptr align 1 %{{.*}}, <16 x i1> %{{.*}}, <16 x i32> %{{.*}}) return _mm512_mask_loadu_epi32 (__W,__U, __P); } __m512i test_mm512_maskz_loadu_epi32 (__mmask16 __U, void *__P) { // CHECK-LABEL: test_mm512_maskz_loadu_epi32 - // CHECK: 
@llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: @llvm.masked.load.v16i32.p0(ptr align 1 %{{.*}}, <16 x i1> %{{.*}}, <16 x i32> %{{.*}}) return _mm512_maskz_loadu_epi32 (__U, __P); } @@ -326,14 +326,14 @@ __m512i test_mm512_loadu_epi64 (void *__P) __m512i test_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void *__P) { // CHECK-LABEL: test_mm512_mask_loadu_epi64 - // CHECK: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i64> %{{.*}}) + // CHECK: @llvm.masked.load.v8i64.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x i64> %{{.*}}) return _mm512_mask_loadu_epi64 (__W,__U, __P); } __m512i test_mm512_maskz_loadu_epi64 (__mmask16 __U, void *__P) { // CHECK-LABEL: test_mm512_maskz_loadu_epi64 - // CHECK: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i64> %{{.*}}) + // CHECK: @llvm.masked.load.v8i64.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x i64> %{{.*}}) return _mm512_maskz_loadu_epi64 (__U, __P); } @@ -347,7 +347,7 @@ __m512 test_mm512_loadu_ps(void *p) __m512 test_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void *__P) { // CHECK-LABEL: test_mm512_mask_loadu_ps - // CHECK: @llvm.masked.load.v16f32.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: @llvm.masked.load.v16f32.p0(ptr align 1 %{{.*}}, <16 x i1> %{{.*}}, <16 x float> %{{.*}}) return _mm512_mask_loadu_ps (__W,__U, __P); } @@ -361,7 +361,7 @@ __m512d test_mm512_loadu_pd(void *p) __m512d test_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void *__P) { // CHECK-LABEL: test_mm512_mask_loadu_pd - // CHECK: @llvm.masked.load.v8f64.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: @llvm.masked.load.v8f64.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x double> %{{.*}}) return _mm512_mask_loadu_pd (__W,__U, __P); } @@ -399,14 +399,14 @@ __m512 test_mm512_load_ps(void *p) __m512 test_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void *__P) { // CHECK-LABEL: test_mm512_mask_load_ps - // CHECK: @llvm.masked.load.v16f32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: @llvm.masked.load.v16f32.p0(ptr align 64 %{{.*}}, <16 x i1> %{{.*}}, <16 x float> %{{.*}}) return _mm512_mask_load_ps (__W,__U, __P); } __m512 test_mm512_maskz_load_ps(__mmask16 __U, void *__P) { // CHECK-LABEL: test_mm512_maskz_load_ps - // CHECK: @llvm.masked.load.v16f32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x float> %{{.*}}) + // CHECK: @llvm.masked.load.v16f32.p0(ptr align 64 %{{.*}}, <16 x i1> %{{.*}}, <16 x float> %{{.*}}) return _mm512_maskz_load_ps(__U, __P); } @@ -420,14 +420,14 @@ __m512d test_mm512_load_pd(void *p) __m512d test_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void *__P) { // CHECK-LABEL: test_mm512_mask_load_pd - // CHECK: @llvm.masked.load.v8f64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: @llvm.masked.load.v8f64.p0(ptr align 64 %{{.*}}, <8 x i1> %{{.*}}, <8 x double> %{{.*}}) return _mm512_mask_load_pd (__W,__U, __P); } __m512d test_mm512_maskz_load_pd(__mmask8 __U, void *__P) { // CHECK-LABEL: test_mm512_maskz_load_pd - // CHECK: @llvm.masked.load.v8f64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x double> %{{.*}}) + // CHECK: @llvm.masked.load.v8f64.p0(ptr align 64 %{{.*}}, <8 x i1> %{{.*}}, <8 x double> %{{.*}}) return _mm512_maskz_load_pd(__U, __P); } @@ -2452,6 +2452,7 @@ __m256d test_mm512_extractf64x4_pd(__m512d a) // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> return _mm512_extractf64x4_pd(a, 1); 
} +TEST_CONSTEXPR(match_m256d(_mm512_extractf64x4_pd(((__m512d){0.0,1.0,2.0,3.0, 4.0,5.0,6.0,7.0}),1),4.0, 5.0, 6.0, 7.0)); __m256d test_mm512_mask_extractf64x4_pd(__m256d __W,__mmask8 __U,__m512d __A){ // CHECK-LABEL: test_mm512_mask_extractf64x4_pd @@ -2459,6 +2460,7 @@ __m256d test_mm512_mask_extractf64x4_pd(__m256d __W,__mmask8 __U,__m512d __A){ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm512_mask_extractf64x4_pd( __W, __U, __A, 1); } +TEST_CONSTEXPR(match_m256d(_mm512_mask_extractf64x4_pd(((__m256d){100.0,101.0,102.0,103.0}), (__mmask8)0x5,((__m512d){0.0,1.0,2.0,3.0, 4.0,5.0,6.0,7.0}), 1), 4.0, 101.0, 6.0, 103.0)); __m256d test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){ // CHECK-LABEL: test_mm512_maskz_extractf64x4_pd @@ -2466,6 +2468,7 @@ __m256d test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm512_maskz_extractf64x4_pd( __U, __A, 1); } +TEST_CONSTEXPR(match_m256d(_mm512_maskz_extractf64x4_pd((__mmask8)0x3,((__m512d){0.0,1.0,2.0,3.0, 4.0,5.0,6.0,7.0}),1),4.0, 5.0, 0.0, 0.0)); __m128 test_mm512_extractf32x4_ps(__m512 a) { @@ -2473,6 +2476,7 @@ __m128 test_mm512_extractf32x4_ps(__m512 a) // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <4 x i32> return _mm512_extractf32x4_ps(a, 1); } +TEST_CONSTEXPR(match_m128(_mm512_extractf32x4_ps(((__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),1),4.0f, 5.0f, 6.0f, 7.0f)); __m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512 __A){ // CHECK-LABEL: test_mm512_mask_extractf32x4_ps @@ -2480,6 +2484,7 @@ __m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512 __A){ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm512_mask_extractf32x4_ps( __W, __U, __A, 1); } +TEST_CONSTEXPR(match_m128(_mm512_mask_extractf32x4_ps(((__m128){100,101,102,103}),(__mmask8)0x5,((__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),1),4.0f, 101.0f, 6.0f, 103.0f)); __m128 test_mm512_maskz_extractf32x4_ps( __mmask8 __U,__m512 __A){ // CHECK-LABEL: test_mm512_maskz_extractf32x4_ps @@ -2487,6 +2492,7 @@ __m128 test_mm512_maskz_extractf32x4_ps( __mmask8 __U,__m512 __A){ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm512_maskz_extractf32x4_ps(__U, __A, 1); } +TEST_CONSTEXPR(match_m128(_mm512_maskz_extractf32x4_ps((__mmask8)0x3,((__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}),1),4.0f, 5.0f, 0.0f, 0.0f)); __mmask16 test_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) { // CHECK-LABEL: test_mm512_cmpeq_epu32_mask @@ -4560,13 +4566,13 @@ __m512i test_mm512_maskz_srli_epi64_2(__mmask8 __U, __m512i __A, unsigned int __ __m512i test_mm512_mask_load_epi32(__m512i __W, __mmask16 __U, void const *__P) { // CHECK-LABEL: test_mm512_mask_load_epi32 - // CHECK: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: @llvm.masked.load.v16i32.p0(ptr align 64 %{{.*}}, <16 x i1> %{{.*}}, <16 x i32> %{{.*}}) return _mm512_mask_load_epi32(__W, __U, __P); } __m512i test_mm512_maskz_load_epi32(__mmask16 __U, void const *__P) { // CHECK-LABEL: test_mm512_maskz_load_epi32 - // CHECK: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: @llvm.masked.load.v16i32.p0(ptr align 64 %{{.*}}, <16 x i1> %{{.*}}, <16 x i32> %{{.*}}) return _mm512_maskz_load_epi32(__U, __P); } @@ -4596,25 +4602,25 @@ __m512i test_mm512_maskz_mov_epi64(__mmask8 
__U, __m512i __A) { __m512i test_mm512_mask_load_epi64(__m512i __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm512_mask_load_epi64 - // CHECK: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x i64> %{{.*}}) + // CHECK: @llvm.masked.load.v8i64.p0(ptr align 64 %{{.*}}, <8 x i1> %{{.*}}, <8 x i64> %{{.*}}) return _mm512_mask_load_epi64(__W, __U, __P); } __m512i test_mm512_maskz_load_epi64(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm512_maskz_load_epi64 - // CHECK: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x i64> %{{.*}}) + // CHECK: @llvm.masked.load.v8i64.p0(ptr align 64 %{{.*}}, <8 x i1> %{{.*}}, <8 x i64> %{{.*}}) return _mm512_maskz_load_epi64(__U, __P); } void test_mm512_mask_store_epi32(void *__P, __mmask16 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_store_epi32 - // CHECK: @llvm.masked.store.v16i32.p0(<16 x i32> %{{.*}}, ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v16i32.p0(<16 x i32> %{{.*}}, ptr align 64 %{{.*}}, <16 x i1> %{{.*}}) return _mm512_mask_store_epi32(__P, __U, __A); } void test_mm512_mask_store_epi64(void *__P, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_store_epi64 - // CHECK: @llvm.masked.store.v8i64.p0(<8 x i64> %{{.*}}, ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v8i64.p0(<8 x i64> %{{.*}}, ptr align 64 %{{.*}}, <8 x i1> %{{.*}}) return _mm512_mask_store_epi64(__P, __U, __A); } @@ -6836,6 +6842,8 @@ __m512 test_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, float const* _ return _mm512_mask_broadcast_f32x4(__O, __M, _mm_loadu_ps(__A)); } +TEST_CONSTEXPR(match_m512(_mm512_mask_broadcast_f32x4(_mm512_setzero_ps(), 0xAAAA, (__m128)(__v4sf){1,2,3,4}), 0,2,0,4,0,2,0,4,0,2,0,4,0,2,0,4)); + __m512 test_mm512_maskz_broadcast_f32x4(__mmask16 __M, float const* __A) { // CHECK-LABEL: test_mm512_maskz_broadcast_f32x4 // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> @@ -6843,6 +6851,8 @@ __m512 test_mm512_maskz_broadcast_f32x4(__mmask16 __M, float const* __A) { return _mm512_maskz_broadcast_f32x4(__M, _mm_loadu_ps(__A)); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_broadcast_f32x4(0xAAAA, (__m128)(__v4sf){1,2,3,4}), 0,2,0,4,0,2,0,4,0,2,0,4,0,2,0,4)); + __m512d test_mm512_broadcast_f64x4(double const* __A) { // CHECK-LABEL: test_mm512_broadcast_f64x4 // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <8 x i32> @@ -6885,6 +6895,8 @@ __m512i test_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i const* __A) { return _mm512_maskz_broadcast_i32x4(__M, _mm_loadu_si128(__A)); } +TEST_CONSTEXPR(match_v16si(_mm512_maskz_broadcast_i32x4(0xAAAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,3,0,1,0,3,0,1,0,3,0,1,0,3)); + __m512i test_mm512_broadcast_i64x4(__m256i const* __A) { // CHECK-LABEL: test_mm512_broadcast_i64x4 // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <8 x i32> @@ -7357,6 +7369,7 @@ __m128i test_mm512_extracti32x4_epi32(__m512i __A) { // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <4 x i32> return _mm512_extracti32x4_epi32(__A, 3); } +TEST_CONSTEXPR(match_m128i(_mm512_extracti32x4_epi32(((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 3), 0x0000000D0000000CULL, 0x0000000F0000000EULL)); __m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_extracti32x4_epi32 @@ -7364,6 +7377,7 @@ __m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __ // CHECK: select <4 x i1> 
%{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm512_mask_extracti32x4_epi32(__W, __U, __A, 3); } +TEST_CONSTEXPR(match_m128i(_mm512_mask_extracti32x4_epi32(((__m128i)(__v4si){100,101,102,103}), (__mmask8)0x5, ((__m512i)(__v16si){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}), 3), 0x000000650000000CULL, 0x000000670000000EULL)); __m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_extracti32x4_epi32 @@ -7371,12 +7385,14 @@ __m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm512_maskz_extracti32x4_epi32(__U, __A, 3); } +TEST_CONSTEXPR(match_m128i(_mm512_maskz_extracti32x4_epi32((__mmask8)0x3, ((__m512i)(__v16si){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}), 3), 0x0000000D0000000CULL, 0x0000000000000000ULL)); __m256i test_mm512_extracti64x4_epi64(__m512i __A) { // CHECK-LABEL: test_mm512_extracti64x4_epi64 // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <4 x i32> return _mm512_extracti64x4_epi64(__A, 1); } +TEST_CONSTEXPR(match_m256i(_mm512_extracti64x4_epi64(((__m512i)(__v8di){0,1,2,3,4,5,6,7}), 1), 4ULL, 5ULL, 6ULL, 7ULL)); __m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_extracti64x4_epi64 @@ -7384,6 +7400,7 @@ __m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm512_mask_extracti64x4_epi64(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_m256i(_mm512_mask_extracti64x4_epi64(((__m256i)(__v4di){100ULL,101ULL,102ULL,103ULL}), (__mmask8)0x5, (((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL})), 1), 4ULL, 101ULL, 6ULL, 103ULL)); __m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_extracti64x4_epi64 @@ -7391,7 +7408,7 @@ __m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm512_maskz_extracti64x4_epi64(__U, __A, 1); } - +TEST_CONSTEXPR(match_m256i(_mm512_maskz_extracti64x4_epi64((__mmask8)0x3, (((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL})), 1), 4ULL, 5ULL, 0ULL, 0ULL)); __m512d test_mm512_insertf64x4(__m512d __A, __m256d __B) { // CHECK-LABEL: test_mm512_insertf64x4 // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> @@ -10903,6 +10920,8 @@ __m512i test_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) return _mm512_mask_set1_epi32 ( __O, __M, __A); } +TEST_CONSTEXPR(match_v16si(_mm512_mask_set1_epi32(_mm512_setzero_si512(), 0xAAAA, 13), 0,13,0,13,0,13,0,13,0,13,0,13,0,13,0,13)); + __m512i test_mm512_maskz_set1_epi32(__mmask16 __M, int __A) { // CHECK-LABEL: test_mm512_maskz_set1_epi32 @@ -10926,6 +10945,8 @@ __m512i test_mm512_maskz_set1_epi32(__mmask16 __M, int __A) return _mm512_maskz_set1_epi32(__M, __A); } +TEST_CONSTEXPR(match_v16si(_mm512_maskz_set1_epi32(0xAAAA, 19), 0,19,0,19,0,19,0,19,0,19,0,19,0,19,0,19)); + __m512i test_mm512_set_epi8(char e63, char e62, char e61, char e60, char e59, char e58, char e57, char e56, char e55, char e54, char e53, char e52, @@ -11145,6 +11166,8 @@ __m512i test_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) return _mm512_mask_set1_epi64 (__O, __M, __A); } +TEST_CONSTEXPR(match_v8di(_mm512_mask_set1_epi64(_mm512_setzero_si512(), 0xAA, 21), 0,21,0,21,0,21,0,21)); + __m512i 
test_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A) { // CHECK-LABEL: test_mm512_maskz_set1_epi64 @@ -11160,6 +11183,8 @@ __m512i test_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A) return _mm512_maskz_set1_epi64 (__M, __A); } +TEST_CONSTEXPR(match_v8di(_mm512_maskz_set1_epi64(0xAA, 23), 0, 23, 0, 23, 0, 23, 0, 23)); + __m512i test_mm512_set_epi64 (long long __A, long long __B, long long __C, long long __D, long long __E, long long __F, @@ -11558,42 +11583,42 @@ __m128d test_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) void test_mm_mask_store_ss(float * __P, __mmask8 __U, __m128 __A) { // CHECK-LABEL: test_mm_mask_store_ss - // CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %{{.*}}, ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}) + // CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %{{.*}}, ptr align 1 %{{.*}}, <4 x i1> %{{.*}}) _mm_mask_store_ss(__P, __U, __A); } void test_mm_mask_store_sd(double * __P, __mmask8 __U, __m128d __A) { // CHECK-LABEL: test_mm_mask_store_sd - // CHECK: call void @llvm.masked.store.v2f64.p0(<2 x double> %{{.*}}, ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}) + // CHECK: call void @llvm.masked.store.v2f64.p0(<2 x double> %{{.*}}, ptr align 1 %{{.*}}, <2 x i1> %{{.*}}) _mm_mask_store_sd(__P, __U, __A); } __m128 test_mm_mask_load_ss(__m128 __A, __mmask8 __U, const float* __W) { // CHECK-LABEL: test_mm_mask_load_ss - // CHECK: call {{.*}}<4 x float> @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: call {{.*}}<4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}}) return _mm_mask_load_ss(__A, __U, __W); } __m128 test_mm_maskz_load_ss (__mmask8 __U, const float * __W) { // CHECK-LABEL: test_mm_maskz_load_ss - // CHECK: call {{.*}}<4 x float> @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: call {{.*}}<4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}}) return _mm_maskz_load_ss (__U, __W); } __m128d test_mm_mask_load_sd (__m128d __A, __mmask8 __U, const double * __W) { // CHECK-LABEL: test_mm_mask_load_sd - // CHECK: call {{.*}}<2 x double> @llvm.masked.load.v2f64.p0(ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: call {{.*}}<2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}}) return _mm_mask_load_sd (__A, __U, __W); } __m128d test_mm_maskz_load_sd (__mmask8 __U, const double * __W) { // CHECK-LABEL: test_mm_maskz_load_sd - // CHECK: call {{.*}}<2 x double> @llvm.masked.load.v2f64.p0(ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: call {{.*}}<2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}}) return _mm_maskz_load_sd (__U, __W); } diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c index 2befff05c7903..f0a0a3b28542f 100644 --- a/clang/test/CodeGen/X86/avx512fp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c @@ -1505,13 +1505,13 @@ __m128h test_mm_load_sh(void const *A) { __m128h test_mm_mask_load_sh(__m128h __A, __mmask8 __U, const void *__W) { // CHECK-LABEL: test_mm_mask_load_sh - // CHECK: @llvm.masked.load.v8f16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: @llvm.masked.load.v8f16.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x half> %{{.*}}) return _mm_mask_load_sh(__A, __U, __W); } __m128h 
test_mm_maskz_load_sh(__mmask8 __U, const void *__W) { // CHECK-LABEL: test_mm_maskz_load_sh - // CHECK: @llvm.masked.load.v8f16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x half> %{{.*}}) + // CHECK: @llvm.masked.load.v8f16.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x half> %{{.*}}) return _mm_maskz_load_sh(__U, __W); } @@ -1560,7 +1560,7 @@ void test_mm_store_sh(void *A, __m128h B) { void test_mm_mask_store_sh(void *__P, __mmask8 __U, __m128h __A) { // CHECK-LABEL: test_mm_mask_store_sh - // CHECK: call void @llvm.masked.store.v8f16.p0(<8 x half> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}) + // CHECK: call void @llvm.masked.store.v8f16.p0(<8 x half> %{{.*}}, ptr align 1 %{{.*}}, <8 x i1> %{{.*}}) _mm_mask_store_sh(__P, __U, __A); } diff --git a/clang/test/CodeGen/X86/avx512ifma-builtins.c b/clang/test/CodeGen/X86/avx512ifma-builtins.c index eebefb0bad4ab..f90697e3ab9b9 100644 --- a/clang/test/CodeGen/X86/avx512ifma-builtins.c +++ b/clang/test/CodeGen/X86/avx512ifma-builtins.c @@ -8,45 +8,230 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s - #include +#include "builtin_test_helpers.h" __m512i test_mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) { // CHECK-LABEL: test_mm512_madd52hi_epu64 // CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}) - return _mm512_madd52hi_epu64(__X, __Y, __Z); + return _mm512_madd52hi_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v8di(_mm512_madd52hi_epu64( + (__m512i)(__v8du){100, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){10, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){5, 0, 0, 0, 0, 0, 0, 0}), + 100, 0, 0, 0, 0, 0, 0, 0)); + +TEST_CONSTEXPR(match_v8di(_mm512_madd52hi_epu64( + (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){0xFFFFFFFFFFFFFull, 0, 0, 0, + 0, 0, 0, 0}, + (__m512i)(__v8du){0xFFFFFFFFFFFFFull, 0, 0, 0, + 0, 0, 0, 0}), + 0xFFFFFFFFFFFFEull, 0, 0, 0, 0, 0, 0, 0)); + +TEST_CONSTEXPR(match_v8di(_mm512_madd52hi_epu64( + (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8du){0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull}, + (__m512i)(__v8du){0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull}), + 4503599627370495ull, 4503599627370496ull, + 4503599627370497ull, 4503599627370498ull, + 4503599627370499ull, 4503599627370500ull, + 4503599627370501ull, 4503599627370502ull)); + __m512i test_mm512_mask_madd52hi_epu64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_mask_madd52hi_epu64 // CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} - return _mm512_mask_madd52hi_epu64(__W, __M, __X, __Y); + return _mm512_mask_madd52hi_epu64(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_v8di(_mm512_mask_madd52hi_epu64( + (__m512i)(__v8du){111, 222, 333, 444, 555, 666, + 777, 888}, 
+ 0x00, + (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70, + 80}), + 111, 222, 333, 444, 555, 666, 777, 888)); + +TEST_CONSTEXPR(match_v8di(_mm512_mask_madd52hi_epu64( + (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70, + 80}, + 0xFF, + (__m512i)(__v8du){100, 200, 300, 400, 500, 600, + 700, 800}, + (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70, + 80}), + 10, 20, 30, 40, 50, 60, 70, 80)); + __m512i test_mm512_maskz_madd52hi_epu64(__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) { // CHECK-LABEL: test_mm512_maskz_madd52hi_epu64 // CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} - return _mm512_maskz_madd52hi_epu64(__M, __X, __Y, __Z); + return _mm512_maskz_madd52hi_epu64(__M, __X, __Y, __Z); } +TEST_CONSTEXPR(match_v8di(_mm512_maskz_madd52hi_epu64( + 0x00, + (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70, + 80}, + (__m512i)(__v8du){100, 200, 300, 400, 500, 600, + 700, 800}), + 0, 0, 0, 0, 0, 0, 0, 0)); + +TEST_CONSTEXPR(match_v8di(_mm512_maskz_madd52hi_epu64( + 0xFF, + (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70, + 80}, + (__m512i)(__v8du){100, 200, 300, 400, 500, 600, + 700, 800}), + 1, 2, 3, 4, 5, 6, 7, 8)); + __m512i test_mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) { // CHECK-LABEL: test_mm512_madd52lo_epu64 // CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}) - return _mm512_madd52lo_epu64(__X, __Y, __Z); + return _mm512_madd52lo_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64( + (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){10, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){5, 0, 0, 0, 0, 0, 0, 0}), + 50, 0, 0, 0, 0, 0, 0, 0)); + +TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64( + (__m512i)(__v8du){100, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){20, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){30, 0, 0, 0, 0, 0, 0, 0}), + 700, 0, 0, 0, 0, 0, 0, 0)); + +TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64( + (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){0xFFFFFFFFFFFFFull, 0, 0, 0, + 0, 0, 0, 0}, + (__m512i)(__v8du){1, 0, 0, 0, 0, 0, 0, 0}), + 0xFFFFFFFFFFFFFull, 0, 0, 0, 0, 0, 0, 0)); + +TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64( + (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){0x1F000000000000ull, 0, 0, 0, + 0, 0, 0, 0}, + (__m512i)(__v8du){2, 0, 0, 0, 0, 0, 0, 0}), + 0xE000000000000ull, 0, 0, 0, 0, 0, 0, 0)); + +TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64( + (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70, + 80}, + (__m512i)(__v8du){2, 3, 4, 5, 6, 7, 8, 9}), + 21, 62, 123, 204, 305, 426, 567, 728)); + +TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64( + (__m512i)(__v8du){0xFFFFFFFFFFFFFull, 0, 0, 0, + 0, 0, 0, 0}, + (__m512i)(__v8du){10, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){5, 0, 0, 0, 0, 0, 0, 0}), + 4503599627370545ull, 0, 0, 0, 0, 0, 0, 0)); + +TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64( + (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70, + 80}, + (__m512i)(__v8du){100, 200, 300, 400, 500, 600, + 700, 800}, + (__m512i)(__v8du){2, 3, 4, 5, 6, 7, 8, 9}), + 210, 620, 1230, 2040, 3050, 4260, 5670, 7280)); + +TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64( + (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0}, + 
(__m512i)(__v8du){0x1F000000000000ull, + 0x1F000000000000ull, 0, 0, 0, + 0, 0, 0}, + (__m512i)(__v8du){2, 3, 0, 0, 0, 0, 0, 0}), + 0xE000000000000ull, 0xD000000000000ull, 0, 0, 0, 0, + 0, 0)); + +TEST_CONSTEXPR(match_v8di(_mm512_madd52lo_epu64( + (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull}, + (__m512i)(__v8du){1, 1, 1, 1, 1, 1, 1, 1}), + 0xFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFull)); + __m512i test_mm512_mask_madd52lo_epu64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { // CHECK-LABEL: test_mm512_mask_madd52lo_epu64 // CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} - return _mm512_mask_madd52lo_epu64(__W, __M, __X, __Y); + return _mm512_mask_madd52lo_epu64(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_v8di(_mm512_mask_madd52lo_epu64( + (__m512i)(__v8du){111, 222, 333, 444, 555, 666, + 777, 888}, + 0x00, + (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70, + 80}), + 111, 222, 333, 444, 555, 666, 777, 888)); + +TEST_CONSTEXPR(match_v8di(_mm512_mask_madd52lo_epu64( + (__m512i)(__v8du){1000, 2000, 3000, 4000, 5000, + 6000, 7000, 8000}, + 0xFF, + (__m512i)(__v8du){100, 200, 300, 400, 500, 600, + 700, 800}, + (__m512i)(__v8du){20, 30, 40, 50, 60, 70, 80, + 90}), + 3000, 8000, 15000, 24000, 35000, 48000, 63000, + 80000)); + __m512i test_mm512_maskz_madd52lo_epu64(__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) { // CHECK-LABEL: test_mm512_maskz_madd52lo_epu64 // CHECK: call {{.*}}<8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} - return _mm512_maskz_madd52lo_epu64(__M, __X, __Y, __Z); + return _mm512_maskz_madd52lo_epu64(__M, __X, __Y, __Z); } + +TEST_CONSTEXPR(match_v8di(_mm512_maskz_madd52lo_epu64( + 0x00, + (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8du){10, 20, 30, 40, 50, 60, 70, + 80}, + (__m512i)(__v8du){2, 3, 4, 5, 6, 7, 8, 9}), + 0, 0, 0, 0, 0, 0, 0, 0)); + +TEST_CONSTEXPR(match_v8di(_mm512_maskz_madd52lo_epu64( + 0xFF, + (__m512i)(__v8du){100, 200, 300, 400, 500, 600, + 700, 800}, + (__m512i)(__v8du){20, 30, 40, 50, 60, 70, 80, + 90}, + (__m512i)(__v8du){30, 40, 50, 60, 70, 80, 90, + 100}), + 700, 1400, 2300, 3400, 4700, 6200, 7900, 9800)); diff --git a/clang/test/CodeGen/X86/avx512ifmavl-builtins.c b/clang/test/CodeGen/X86/avx512ifmavl-builtins.c index 89108fc037520..1cbb5807a660e 100644 --- a/clang/test/CodeGen/X86/avx512ifmavl-builtins.c +++ b/clang/test/CodeGen/X86/avx512ifmavl-builtins.c @@ -8,85 +8,241 @@ // RUN: %clang_cc1 -x c++ %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s // RUN: %clang_cc1 -x c++ %s -flax-vector-conversions=none -ffreestanding -triple=i386-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s - #include +#include "builtin_test_helpers.h" __m128i 
test_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { // CHECK-LABEL: test_mm_madd52hi_epu64 // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) - return _mm_madd52hi_epu64(__X, __Y, __Z); + return _mm_madd52hi_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v2di(_mm_madd52hi_epu64( + (__m128i)((__v2du){100, 0}), + (__m128i)((__v2du){10, 0}), + (__m128i)((__v2du){5, 0})), + 100, 0)); + +TEST_CONSTEXPR(match_v2di(_mm_madd52hi_epu64( + (__m128i)((__v2du){0, 0}), + (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0}), + (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0})), + 0xFFFFFFFFFFFFEull, 0)); + __m128i test_mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_mask_madd52hi_epu64 // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} - return _mm_mask_madd52hi_epu64(__W, __M, __X, __Y); + return _mm_mask_madd52hi_epu64(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_v2di(_mm_mask_madd52hi_epu64((__m128i)((__v2du){111, 222}), + 0x0, + (__m128i)((__v2du){1, 2}), + (__m128i)((__v2du){10, 20})), + 111, 222)); + +TEST_CONSTEXPR(match_v2di(_mm_mask_madd52hi_epu64((__m128i)((__v2du){10, 20}), + 0x2, + (__m128i)((__v2du){0x1000000000000ULL, 0x1000000000000ULL}), + (__m128i)((__v2du){0x1000000000000ULL, 0x1000000000000ULL})), + 10, 0x100000000014ULL)); + __m128i test_mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { // CHECK-LABEL: test_mm_maskz_madd52hi_epu64 // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} - return _mm_maskz_madd52hi_epu64(__M, __X, __Y, __Z); + return _mm_maskz_madd52hi_epu64(__M, __X, __Y, __Z); } +TEST_CONSTEXPR(match_v2di(_mm_maskz_madd52hi_epu64(0x3, + (__m128i)((__v2du){1, 2}), + (__m128i)((__v2du){10, 20}), + (__m128i)((__v2du){100, 200})), + 1, 2)); + +TEST_CONSTEXPR(match_v2di(_mm_maskz_madd52hi_epu64(0x1, + (__m128i)((__v2du){0x1000000000000ULL, 0x1000000000000ULL}), + (__m128i)((__v2du){0x1000000000000ULL, 0x1000000000000ULL}), + (__m128i)((__v2du){0, 0})), + 0x1000000000000ULL, 0)); + __m256i test_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) { // CHECK-LABEL: test_mm256_madd52hi_epu64 // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}) - return _mm256_madd52hi_epu64(__X, __Y, __Z); + return _mm256_madd52hi_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v4di(_mm256_madd52hi_epu64( + (__m256i)((__v4du){100, 200, 300, 400}), + (__m256i)((__v4du){10, 20, 30, 40}), + (__m256i)((__v4du){5, 6, 7, 8})), + 100, 200, 300, 400)); + +TEST_CONSTEXPR(match_v4di(_mm256_madd52hi_epu64( + (__m256i)((__v4du){0, 0, 0, 0}), + (__m256i)((__v4du){0xFFFFFFFFFFFFFull, 0, 0, + 0}), + (__m256i)((__v4du){0xFFFFFFFFFFFFFull, 0, 0, + 0})), + 0xFFFFFFFFFFFFEull, 0, 0, 0)); + __m256i test_mm256_mask_madd52hi_epu64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { // CHECK-LABEL: test_mm256_mask_madd52hi_epu64 // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} - return _mm256_mask_madd52hi_epu64(__W, __M, __X, __Y); + return _mm256_mask_madd52hi_epu64(__W, 
__M, __X, __Y); } +TEST_CONSTEXPR(match_v4di(_mm256_mask_madd52hi_epu64((__m256i)((__v4du){111, 222, 333, 444}), + 0x0, + (__m256i)((__v4du){1, 2, 3, 4}), + (__m256i)((__v4du){10, 20, 30, 40})), + 111, 222, 333, 444)); + +TEST_CONSTEXPR(match_v4di(_mm256_mask_madd52hi_epu64((__m256i)((__v4du){10, 20, 30, 40}), + 0xA, + (__m256i)((__v4du){0x1000000000000ULL, 0x1000000000000ULL, + 0x1000000000000ULL, 0x1000000000000ULL}), + (__m256i)((__v4du){0x1000000000000ULL, 0x1000000000000ULL, + 0x1000000000000ULL, 0x1000000000000ULL})), + 10, 0x100000000014ULL, 30, 0x100000000028ULL)); + __m256i test_mm256_maskz_madd52hi_epu64(__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { // CHECK-LABEL: test_mm256_maskz_madd52hi_epu64 // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} - return _mm256_maskz_madd52hi_epu64(__M, __X, __Y, __Z); + return _mm256_maskz_madd52hi_epu64(__M, __X, __Y, __Z); } +TEST_CONSTEXPR(match_v4di(_mm256_maskz_madd52hi_epu64(0xF, + (__m256i)((__v4du){1, 2, 3, 4}), + (__m256i)((__v4du){10, 20, 30, 40}), + (__m256i)((__v4du){100, 200, 300, 400})), + 1, 2, 3, 4)); + +TEST_CONSTEXPR(match_v4di(_mm256_maskz_madd52hi_epu64(0x5, + (__m256i)((__v4du){0x1000000000000ULL, 0x1000000000000ULL, + 0x1000000000000ULL, 0x1000000000000ULL}), + (__m256i)((__v4du){0x1000000000000ULL, 0x1000000000000ULL, + 0x1000000000000ULL, 0x1000000000000ULL}), + (__m256i)((__v4du){0, 0, 0, 0})), + 0x1000000000000ULL, 0, 0x1000000000000ULL, 0)); + __m128i test_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) { // CHECK-LABEL: test_mm_madd52lo_epu64 // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) - return _mm_madd52lo_epu64(__X, __Y, __Z); + return _mm_madd52lo_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v2di(_mm_madd52lo_epu64( + (__m128i)((__v2du){0, 0}), + (__m128i)((__v2du){10, 0}), + (__m128i)((__v2du){5, 0})), + 50, 0)); + +TEST_CONSTEXPR(match_v2di(_mm_madd52lo_epu64( + (__m128i)((__v2du){100, 0}), + (__m128i)((__v2du){20, 0}), + (__m128i)((__v2du){30, 0})), + 700, 0)); + +TEST_CONSTEXPR(match_v2di(_mm_madd52lo_epu64( + (__m128i)((__v2du){1, 2}), + (__m128i)((__v2du){10, 20}), + (__m128i)((__v2du){2, 3})), + 21, 62)); + __m128i test_mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_mask_madd52lo_epu64 // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} - return _mm_mask_madd52lo_epu64(__W, __M, __X, __Y); + return _mm_mask_madd52lo_epu64(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_v2di(_mm_mask_madd52lo_epu64((__m128i)((__v2du){1000, 2000}), + 0x3, + (__m128i)((__v2du){100, 200}), + (__m128i)((__v2du){20, 30})), + 3000, 8000)); + +TEST_CONSTEXPR(match_v2di(_mm_mask_madd52lo_epu64((__m128i)((__v2du){111, 222}), + 0x0, + (__m128i)((__v2du){1, 2}), + (__m128i)((__v2du){10, 20})), + 111, 222)); + __m128i test_mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { // CHECK-LABEL: test_mm_maskz_madd52lo_epu64 // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} - return _mm_maskz_madd52lo_epu64(__M, __X, __Y, __Z); + return 
_mm_maskz_madd52lo_epu64(__M, __X, __Y, __Z); } +TEST_CONSTEXPR(match_v2di(_mm_maskz_madd52lo_epu64(0x3, + (__m128i)((__v2du){100, 200}), + (__m128i)((__v2du){20, 30}), + (__m128i)((__v2du){30, 40})), + 700, 1400)); + +TEST_CONSTEXPR(match_v2di(_mm_maskz_madd52lo_epu64(0x1, + (__m128i)((__v2du){100, 0}), + (__m128i)((__v2du){20, 0}), + (__m128i)((__v2du){30, 0})), + 700, 0)); + __m256i test_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) { // CHECK-LABEL: test_mm256_madd52lo_epu64 // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}) - return _mm256_madd52lo_epu64(__X, __Y, __Z); + return _mm256_madd52lo_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_epu64( + (__m256i)((__v4du){1, 2, 3, 4}), + (__m256i)((__v4du){10, 20, 30, 40}), + (__m256i)((__v4du){2, 3, 4, 5})), + 21, 62, 123, 204)); + __m256i test_mm256_mask_madd52lo_epu64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { // CHECK-LABEL: test_mm256_mask_madd52lo_epu64 // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} - return _mm256_mask_madd52lo_epu64(__W, __M, __X, __Y); + return _mm256_mask_madd52lo_epu64(__W, __M, __X, __Y); } +TEST_CONSTEXPR(match_v4di(_mm256_mask_madd52lo_epu64((__m256i)((__v4du){1000, 2000, 3000, 4000}), + 0xF, + (__m256i)((__v4du){100, 200, 300, 400}), + (__m256i)((__v4du){20, 30, 40, 50})), + 3000, 8000, 15000, 24000)); + +TEST_CONSTEXPR(match_v4di(_mm256_mask_madd52lo_epu64((__m256i)((__v4du){111, 222, 333, 444}), + 0x0, + (__m256i)((__v4du){1, 2, 3, 4}), + (__m256i)((__v4du){10, 20, 30, 40})), + 111, 222, 333, 444)); + +TEST_CONSTEXPR(match_v4di(_mm256_mask_madd52lo_epu64((__m256i)((__v4du){11, 22, 33, 44}), + 0x5, + (__m256i)((__v4du){100, 200, 300, 400}), + (__m256i)((__v4du){10, 20, 30, 40})), + 1011, 22, 9033, 44)); + __m256i test_mm256_maskz_madd52lo_epu64(__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { // CHECK-LABEL: test_mm256_maskz_madd52lo_epu64 // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} - return _mm256_maskz_madd52lo_epu64(__M, __X, __Y, __Z); + return _mm256_maskz_madd52lo_epu64(__M, __X, __Y, __Z); } + +TEST_CONSTEXPR(match_v4di(_mm256_maskz_madd52lo_epu64(0xF, + (__m256i)((__v4du){100, 200, 300, 400}), + (__m256i)((__v4du){20, 30, 40, 50}), + (__m256i)((__v4du){30, 40, 50, 60})), + 700, 1400, 2300, 3400)); + +TEST_CONSTEXPR(match_v4di(_mm256_maskz_madd52lo_epu64(0x9, + (__m256i)((__v4du){100, 200, 300, 400}), + (__m256i)((__v4du){10, 20, 30, 40}), + (__m256i)((__v4du){5, 10, 15, 20})), + 150, 0, 0, 1200)); diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 51385d57d2944..34db7641b9448 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -7002,7 +7002,7 @@ void test_mm_store_epi32(void *__P, __m128i __A) { void test_mm_mask_store_epi32(void *__P, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_store_epi32 - // CHECK: @llvm.masked.store.v4i32.p0(<4 x i32> %{{.*}}, ptr %{{.}}, i32 16, <4 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v4i32.p0(<4 x i32> %{{.*}}, ptr align 16 %{{.}}, <4 x i1> %{{.*}}) return _mm_mask_store_epi32(__P, __U, __A); } @@ -7014,7 +7014,7 @@ void 
test_mm256_store_epi32(void *__P, __m256i __A) { void test_mm256_mask_store_epi32(void *__P, __mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_store_epi32 - // CHECK: @llvm.masked.store.v8i32.p0(<8 x i32> %{{.*}}, ptr %{{.}}, i32 32, <8 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v8i32.p0(<8 x i32> %{{.*}}, ptr align 32 %{{.}}, <8 x i1> %{{.*}}) return _mm256_mask_store_epi32(__P, __U, __A); } @@ -7074,13 +7074,13 @@ __m128i test_mm_load_epi32(void const *__P) { __m128i test_mm_mask_load_epi32(__m128i __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_mask_load_epi32 - // CHECK: @llvm.masked.load.v4i32.p0(ptr %{{.*}}, i32 16, <4 x i1> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: @llvm.masked.load.v4i32.p0(ptr align 16 %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}}) return _mm_mask_load_epi32(__W, __U, __P); } __m128i test_mm_maskz_load_epi32(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_maskz_load_epi32 - // CHECK: @llvm.masked.load.v4i32.p0(ptr %{{.*}}, i32 16, <4 x i1> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: @llvm.masked.load.v4i32.p0(ptr align 16 %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}}) return _mm_maskz_load_epi32(__U, __P); } @@ -7092,13 +7092,13 @@ __m256i test_mm256_load_epi32(void const *__P) { __m256i test_mm256_mask_load_epi32(__m256i __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_mask_load_epi32 - // CHECK: @llvm.masked.load.v8i32.p0(ptr %{{.*}}, i32 32, <8 x i1> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: @llvm.masked.load.v8i32.p0(ptr align 32 %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_mask_load_epi32(__W, __U, __P); } __m256i test_mm256_maskz_load_epi32(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_maskz_load_epi32 - // CHECK: @llvm.masked.load.v8i32.p0(ptr %{{.*}}, i32 32, <8 x i1> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: @llvm.masked.load.v8i32.p0(ptr align 32 %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_maskz_load_epi32(__U, __P); } @@ -7110,13 +7110,13 @@ __m128i test_mm_load_epi64(void const *__P) { __m128i test_mm_mask_load_epi64(__m128i __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_mask_load_epi64 - // CHECK: @llvm.masked.load.v2i64.p0(ptr %{{.*}}, i32 16, <2 x i1> %{{.*}}, <2 x i64> %{{.*}}) + // CHECK: @llvm.masked.load.v2i64.p0(ptr align 16 %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}}) return _mm_mask_load_epi64(__W, __U, __P); } __m128i test_mm_maskz_load_epi64(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_maskz_load_epi64 - // CHECK: @llvm.masked.load.v2i64.p0(ptr %{{.*}}, i32 16, <2 x i1> %{{.*}}, <2 x i64> %{{.*}}) + // CHECK: @llvm.masked.load.v2i64.p0(ptr align 16 %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}}) return _mm_maskz_load_epi64(__U, __P); } @@ -7128,13 +7128,13 @@ __m256i test_mm256_load_epi64(void const *__P) { __m256i test_mm256_mask_load_epi64(__m256i __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_mask_load_epi64 - // CHECK: @llvm.masked.load.v4i64.p0(ptr %{{.*}}, i32 32, <4 x i1> %{{.*}}, <4 x i64> %{{.*}}) + // CHECK: @llvm.masked.load.v4i64.p0(ptr align 32 %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}}) return _mm256_mask_load_epi64(__W, __U, __P); } __m256i test_mm256_maskz_load_epi64(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_maskz_load_epi64 - // CHECK: @llvm.masked.load.v4i64.p0(ptr %{{.*}}, i32 32, <4 x i1> %{{.*}}, <4 x i64> %{{.*}}) + // CHECK: @llvm.masked.load.v4i64.p0(ptr align 32 %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}}) return _mm256_maskz_load_epi64(__U, __P); } @@ -7146,7 +7146,7 
+7146,7 @@ void test_mm_store_epi64(void *__P, __m128i __A) { void test_mm_mask_store_epi64(void *__P, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_store_epi64 - // CHECK: @llvm.masked.store.v2i64.p0(<2 x i64> %{{.*}}, ptr %{{.*}}, i32 16, <2 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v2i64.p0(<2 x i64> %{{.*}}, ptr align 16 %{{.*}}, <2 x i1> %{{.*}}) return _mm_mask_store_epi64(__P, __U, __A); } @@ -7158,7 +7158,7 @@ void test_mm256_store_epi64(void *__P, __m256i __A) { void test_mm256_mask_store_epi64(void *__P, __mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_store_epi64 - // CHECK: @llvm.masked.store.v4i64.p0(<4 x i64> %{{.*}}, ptr %{{.*}}, i32 32, <4 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v4i64.p0(<4 x i64> %{{.*}}, ptr align 32 %{{.*}}, <4 x i1> %{{.*}}) return _mm256_mask_store_epi64(__P, __U, __A); } @@ -7201,6 +7201,8 @@ __m128i test_mm_mask_set1_epi32(__m128i __O, __mmask8 __M) { return _mm_mask_set1_epi32(__O, __M, 5); } +TEST_CONSTEXPR(match_v4si(_mm_mask_set1_epi32(_mm_setzero_si128(), 0xF, 7), 7, 7, 7, 7)); + __m128i test_mm_maskz_set1_epi32(__mmask8 __M) { // CHECK-LABEL: test_mm_maskz_set1_epi32 // CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i32 0 @@ -7212,6 +7214,8 @@ __m128i test_mm_maskz_set1_epi32(__mmask8 __M) { return _mm_maskz_set1_epi32(__M, 5); } +TEST_CONSTEXPR(match_v4si(_mm_maskz_set1_epi32(0xA, 11), 0, 11, 0, 11)); + __m256i test_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M) { // CHECK-LABEL: test_mm256_mask_set1_epi32 // CHECK: insertelement <8 x i32> poison, i32 %{{.*}}, i32 0 @@ -7226,6 +7230,8 @@ __m256i test_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M) { return _mm256_mask_set1_epi32(__O, __M, 5); } +TEST_CONSTEXPR(match_v8si(_mm256_mask_set1_epi32(_mm256_setzero_si256(), 0xAA, 5), 0, 5, 0, 5, 0, 5, 0, 5)); + __m256i test_mm256_maskz_set1_epi32(__mmask8 __M) { // CHECK-LABEL: test_mm256_maskz_set1_epi32 // CHECK: insertelement <8 x i32> poison, i32 %{{.*}}, i32 0 @@ -7240,6 +7246,8 @@ __m256i test_mm256_maskz_set1_epi32(__mmask8 __M) { return _mm256_maskz_set1_epi32(__M, 5); } +TEST_CONSTEXPR(match_v8si(_mm256_maskz_set1_epi32(0xAA, 13), 0, 13, 0, 13, 0, 13, 0, 13)); + __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) { // CHECK-LABEL: test_mm_mask_set1_epi64 // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0 @@ -7249,6 +7257,8 @@ __m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) { return _mm_mask_set1_epi64(__O, __M, __A); } +TEST_CONSTEXPR(match_v2di(_mm_mask_set1_epi64(_mm_setzero_si128(), 0x3, 9), 9, 9)); + __m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) { // CHECK-LABEL: test_mm_maskz_set1_epi64 // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0 @@ -7258,6 +7268,8 @@ __m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) { return _mm_maskz_set1_epi64(__M, __A); } +TEST_CONSTEXPR(match_v2di(_mm_maskz_set1_epi64(0x2, 15), 0, 15)); + __m256i test_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) { // CHECK-LABEL: test_mm256_mask_set1_epi64 // CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i32 0 @@ -7269,6 +7281,8 @@ __m256i test_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) { return _mm256_mask_set1_epi64(__O, __M, __A); } +TEST_CONSTEXPR(match_v4di(_mm256_mask_set1_epi64(_mm256_setzero_si256(), 0xF, 11), 11, 11, 11, 11)); + __m256i test_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) { // CHECK-LABEL: test_mm256_maskz_set1_epi64 // CHECK: insertelement <4 x i64> poison, i64 %{{.*}},
i32 0 @@ -7280,6 +7294,8 @@ __m256i test_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) { return _mm256_maskz_set1_epi64(__M, __A); } +TEST_CONSTEXPR(match_v4di(_mm256_maskz_set1_epi64(0xA, 17), 0, 17, 0, 17)); + __m128d test_mm_fixupimm_pd(__m128d __A, __m128d __B, __m128i __C) { // CHECK-LABEL: test_mm_fixupimm_pd // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.128 @@ -7354,49 +7370,49 @@ __m256 test_mm256_maskz_fixupimm_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __m128d test_mm_mask_load_pd(__m128d __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_mask_load_pd - // CHECK: @llvm.masked.load.v2f64.p0(ptr %{{.*}}, i32 16, <2 x i1> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: @llvm.masked.load.v2f64.p0(ptr align 16 %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}}) return _mm_mask_load_pd(__W, __U, __P); } __m128d test_mm_maskz_load_pd(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_maskz_load_pd - // CHECK: @llvm.masked.load.v2f64.p0(ptr %{{.*}}, i32 16, <2 x i1> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: @llvm.masked.load.v2f64.p0(ptr align 16 %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}}) return _mm_maskz_load_pd(__U, __P); } __m256d test_mm256_mask_load_pd(__m256d __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_mask_load_pd - // CHECK: @llvm.masked.load.v4f64.p0(ptr %{{.*}}, i32 32, <4 x i1> %{{.*}}, <4 x double> %{{.*}}) + // CHECK: @llvm.masked.load.v4f64.p0(ptr align 32 %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}}) return _mm256_mask_load_pd(__W, __U, __P); } __m256d test_mm256_maskz_load_pd(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_maskz_load_pd - // CHECK: @llvm.masked.load.v4f64.p0(ptr %{{.*}}, i32 32, <4 x i1> %{{.*}}, <4 x double> %{{.*}}) + // CHECK: @llvm.masked.load.v4f64.p0(ptr align 32 %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}}) return _mm256_maskz_load_pd(__U, __P); } __m128 test_mm_mask_load_ps(__m128 __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_mask_load_ps - // CHECK: @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 16, <4 x i1> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: @llvm.masked.load.v4f32.p0(ptr align 16 %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}}) return _mm_mask_load_ps(__W, __U, __P); } __m128 test_mm_maskz_load_ps(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_maskz_load_ps - // CHECK: @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 16, <4 x i1> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: @llvm.masked.load.v4f32.p0(ptr align 16 %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}}) return _mm_maskz_load_ps(__U, __P); } __m256 test_mm256_mask_load_ps(__m256 __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_mask_load_ps - // CHECK: @llvm.masked.load.v8f32.p0(ptr %{{.*}}, i32 32, <8 x i1> %{{.*}}, <8 x float> %{{.*}}) + // CHECK: @llvm.masked.load.v8f32.p0(ptr align 32 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}}) return _mm256_mask_load_ps(__W, __U, __P); } __m256 test_mm256_maskz_load_ps(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_maskz_load_ps - // CHECK: @llvm.masked.load.v8f32.p0(ptr %{{.*}}, i32 32, <8 x i1> %{{.*}}, <8 x float> %{{.*}}) + // CHECK: @llvm.masked.load.v8f32.p0(ptr align 32 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}}) return _mm256_maskz_load_ps(__U, __P); } @@ -7408,13 +7424,13 @@ __m128i test_mm_loadu_epi64(void const *__P) { __m128i test_mm_mask_loadu_epi64(__m128i __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_mask_loadu_epi64 - // CHECK: @llvm.masked.load.v2i64.p0(ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x 
i64> %{{.*}}) + // CHECK: @llvm.masked.load.v2i64.p0(ptr align 1 %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}}) return _mm_mask_loadu_epi64(__W, __U, __P); } __m128i test_mm_maskz_loadu_epi64(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_maskz_loadu_epi64 - // CHECK: @llvm.masked.load.v2i64.p0(ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x i64> %{{.*}}) + // CHECK: @llvm.masked.load.v2i64.p0(ptr align 1 %{{.*}}, <2 x i1> %{{.*}}, <2 x i64> %{{.*}}) return _mm_maskz_loadu_epi64(__U, __P); } @@ -7426,13 +7442,13 @@ __m256i test_mm256_loadu_epi64(void const *__P) { __m256i test_mm256_mask_loadu_epi64(__m256i __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_mask_loadu_epi64 - // CHECK: @llvm.masked.load.v4i64.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x i64> %{{.*}}) + // CHECK: @llvm.masked.load.v4i64.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}}) return _mm256_mask_loadu_epi64(__W, __U, __P); } __m256i test_mm256_maskz_loadu_epi64(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_maskz_loadu_epi64 - // CHECK: @llvm.masked.load.v4i64.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x i64> %{{.*}}) + // CHECK: @llvm.masked.load.v4i64.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x i64> %{{.*}}) return _mm256_maskz_loadu_epi64(__U, __P); } @@ -7444,13 +7460,13 @@ __m128i test_mm_loadu_epi32(void const *__P) { __m128i test_mm_mask_loadu_epi32(__m128i __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_mask_loadu_epi32 - // CHECK: @llvm.masked.load.v4i32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: @llvm.masked.load.v4i32.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}}) return _mm_mask_loadu_epi32(__W, __U, __P); } __m128i test_mm_maskz_loadu_epi32(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_maskz_loadu_epi32 - // CHECK: @llvm.masked.load.v4i32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: @llvm.masked.load.v4i32.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x i32> %{{.*}}) return _mm_maskz_loadu_epi32(__U, __P); } @@ -7462,85 +7478,85 @@ __m256i test_mm256_loadu_epi32(void const *__P) { __m256i test_mm256_mask_loadu_epi32(__m256i __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_mask_loadu_epi32 - // CHECK: @llvm.masked.load.v8i32.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: @llvm.masked.load.v8i32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_mask_loadu_epi32(__W, __U, __P); } __m256i test_mm256_maskz_loadu_epi32(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_maskz_loadu_epi32 - // CHECK: @llvm.masked.load.v8i32.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: @llvm.masked.load.v8i32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_maskz_loadu_epi32(__U, __P); } __m128d test_mm_mask_loadu_pd(__m128d __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_mask_loadu_pd - // CHECK: @llvm.masked.load.v2f64.p0(ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: @llvm.masked.load.v2f64.p0(ptr align 1 %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}}) return _mm_mask_loadu_pd(__W, __U, __P); } __m128d test_mm_maskz_loadu_pd(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_maskz_loadu_pd - // CHECK: @llvm.masked.load.v2f64.p0(ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}}) + // CHECK: @llvm.masked.load.v2f64.p0(ptr align 1 %{{.*}}, <2 x i1> %{{.*}}, <2 x double> %{{.*}}) return _mm_maskz_loadu_pd(__U, 
__P); } __m256d test_mm256_mask_loadu_pd(__m256d __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_mask_loadu_pd - // CHECK: @llvm.masked.load.v4f64.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x double> %{{.*}}) + // CHECK: @llvm.masked.load.v4f64.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}}) return _mm256_mask_loadu_pd(__W, __U, __P); } __m256d test_mm256_maskz_loadu_pd(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_maskz_loadu_pd - // CHECK: @llvm.masked.load.v4f64.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x double> %{{.*}}) + // CHECK: @llvm.masked.load.v4f64.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x double> %{{.*}}) return _mm256_maskz_loadu_pd(__U, __P); } __m128 test_mm_mask_loadu_ps(__m128 __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_mask_loadu_ps - // CHECK: @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: @llvm.masked.load.v4f32.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}}) return _mm_mask_loadu_ps(__W, __U, __P); } __m128 test_mm_maskz_loadu_ps(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_maskz_loadu_ps - // CHECK: @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}}) + // CHECK: @llvm.masked.load.v4f32.p0(ptr align 1 %{{.*}}, <4 x i1> %{{.*}}, <4 x float> %{{.*}}) return _mm_maskz_loadu_ps(__U, __P); } __m256 test_mm256_mask_loadu_ps(__m256 __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_mask_loadu_ps - // CHECK: @llvm.masked.load.v8f32.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x float> %{{.*}}) + // CHECK: @llvm.masked.load.v8f32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}}) return _mm256_mask_loadu_ps(__W, __U, __P); } __m256 test_mm256_maskz_loadu_ps(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm256_maskz_loadu_ps - // CHECK: @llvm.masked.load.v8f32.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x float> %{{.*}}) + // CHECK: @llvm.masked.load.v8f32.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x float> %{{.*}}) return _mm256_maskz_loadu_ps(__U, __P); } void test_mm_mask_store_pd(void *__P, __mmask8 __U, __m128d __A) { // CHECK-LABEL: test_mm_mask_store_pd - // CHECK: @llvm.masked.store.v2f64.p0(<2 x double> %{{.*}}, ptr %{{.*}}, i32 16, <2 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v2f64.p0(<2 x double> %{{.*}}, ptr align 16 %{{.*}}, <2 x i1> %{{.*}}) return _mm_mask_store_pd(__P, __U, __A); } void test_mm256_mask_store_pd(void *__P, __mmask8 __U, __m256d __A) { // CHECK-LABEL: test_mm256_mask_store_pd - // CHECK: @llvm.masked.store.v4f64.p0(<4 x double> %{{.*}}, ptr %{{.*}}, i32 32, <4 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v4f64.p0(<4 x double> %{{.*}}, ptr align 32 %{{.*}}, <4 x i1> %{{.*}}) return _mm256_mask_store_pd(__P, __U, __A); } void test_mm_mask_store_ps(void *__P, __mmask8 __U, __m128 __A) { // CHECK-LABEL: test_mm_mask_store_ps - // CHECK: @llvm.masked.store.v4f32.p0(<4 x float> %{{.*}}, ptr %{{.*}}, i32 16, <4 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v4f32.p0(<4 x float> %{{.*}}, ptr align 16 %{{.*}}, <4 x i1> %{{.*}}) return _mm_mask_store_ps(__P, __U, __A); } void test_mm256_mask_store_ps(void *__P, __mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_mask_store_ps - // CHECK: @llvm.masked.store.v8f32.p0(<8 x float> %{{.*}}, ptr %{{.*}}, i32 32, <8 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v8f32.p0(<8 x float> %{{.*}}, ptr align 32 %{{.*}}, <8 x i1> %{{.*}}) return _mm256_mask_store_ps(__P, __U, __A); } @@ -7552,7 
+7568,7 @@ void test_mm_storeu_epi64(void *__p, __m128i __a) { void test_mm_mask_storeu_epi64(void *__P, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_storeu_epi64 - // CHECK: @llvm.masked.store.v2i64.p0(<2 x i64> %{{.*}}, ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v2i64.p0(<2 x i64> %{{.*}}, ptr align 1 %{{.*}}, <2 x i1> %{{.*}}) return _mm_mask_storeu_epi64(__P, __U, __A); } @@ -7564,7 +7580,7 @@ void test_mm256_storeu_epi64(void *__P, __m256i __A) { void test_mm256_mask_storeu_epi64(void *__P, __mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_storeu_epi64 - // CHECK: @llvm.masked.store.v4i64.p0(<4 x i64> %{{.*}}, ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v4i64.p0(<4 x i64> %{{.*}}, ptr align 1 %{{.*}}, <4 x i1> %{{.*}}) return _mm256_mask_storeu_epi64(__P, __U, __A); } @@ -7576,7 +7592,7 @@ void test_mm_storeu_epi32(void *__P, __m128i __A) { void test_mm_mask_storeu_epi32(void *__P, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_storeu_epi32 - // CHECK: @llvm.masked.store.v4i32.p0(<4 x i32> %{{.*}}, ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v4i32.p0(<4 x i32> %{{.*}}, ptr align 1 %{{.*}}, <4 x i1> %{{.*}}) return _mm_mask_storeu_epi32(__P, __U, __A); } @@ -7588,31 +7604,31 @@ void test_mm256_storeu_epi32(void *__P, __m256i __A) { void test_mm256_mask_storeu_epi32(void *__P, __mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_storeu_epi32 - // CHECK: @llvm.masked.store.v8i32.p0(<8 x i32> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v8i32.p0(<8 x i32> %{{.*}}, ptr align 1 %{{.*}}, <8 x i1> %{{.*}}) return _mm256_mask_storeu_epi32(__P, __U, __A); } void test_mm_mask_storeu_pd(void *__P, __mmask8 __U, __m128d __A) { // CHECK-LABEL: test_mm_mask_storeu_pd - // CHECK: @llvm.masked.store.v2f64.p0(<2 x double> %{{.*}}, ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v2f64.p0(<2 x double> %{{.*}}, ptr align 1 %{{.*}}, <2 x i1> %{{.*}}) return _mm_mask_storeu_pd(__P, __U, __A); } void test_mm256_mask_storeu_pd(void *__P, __mmask8 __U, __m256d __A) { // CHECK-LABEL: test_mm256_mask_storeu_pd - // CHECK: @llvm.masked.store.v4f64.p0(<4 x double> %{{.*}}, ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v4f64.p0(<4 x double> %{{.*}}, ptr align 1 %{{.*}}, <4 x i1> %{{.*}}) return _mm256_mask_storeu_pd(__P, __U, __A); } void test_mm_mask_storeu_ps(void *__P, __mmask8 __U, __m128 __A) { // CHECK-LABEL: test_mm_mask_storeu_ps - // CHECK: @llvm.masked.store.v4f32.p0(<4 x float> %{{.*}}, ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v4f32.p0(<4 x float> %{{.*}}, ptr align 1 %{{.*}}, <4 x i1> %{{.*}}) return _mm_mask_storeu_ps(__P, __U, __A); } void test_mm256_mask_storeu_ps(void *__P, __mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_mask_storeu_ps - // CHECK: @llvm.masked.store.v8f32.p0(<8 x float> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v8f32.p0(<8 x float> %{{.*}}, ptr align 1 %{{.*}}, <8 x i1> %{{.*}}) return _mm256_mask_storeu_ps(__P, __U, __A); } @@ -7623,6 +7639,8 @@ __m128d test_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d return _mm_mask_unpackhi_pd(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_unpackhi_pd(_mm_setzero_pd(), 0x3, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 2.0,4.0)); + __m128d test_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: 
test_mm_maskz_unpackhi_pd // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> @@ -7637,6 +7655,8 @@ __m256d test_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m2 return _mm256_mask_unpackhi_pd(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_unpackhi_pd(_mm256_setzero_pd(), 0xAA, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 0,6.0,0,8.0)); + __m256d test_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_unpackhi_pd // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> @@ -7679,6 +7699,8 @@ __m128d test_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d return _mm_mask_unpacklo_pd(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m128d(_mm_mask_unpacklo_pd(_mm_setzero_pd(), 0x3, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 1.0,3.0)); + __m128d test_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_maskz_unpacklo_pd // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> @@ -7686,6 +7708,8 @@ __m128d test_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) { return _mm_maskz_unpacklo_pd(__U, __A, __B); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_unpacklo_pd(0x2, (__m128d)(__v2df){1.0,2.0}, (__m128d)(__v2df){3.0,4.0}), 0.0,3.0)); + __m256d test_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_mask_unpacklo_pd // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> @@ -7693,6 +7717,8 @@ __m256d test_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m2 return _mm256_mask_unpacklo_pd(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_unpacklo_pd(_mm256_setzero_pd(), 0xAA, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 0,5.0,0,7.0)); + __m256d test_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_unpacklo_pd // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> @@ -7700,6 +7726,8 @@ __m256d test_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) { return _mm256_maskz_unpacklo_pd(__U, __A, __B); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_unpacklo_pd(0x0A, (__m256d)(__v4df){1.0,2.0,3.0,4.0}, (__m256d)(__v4df){5.0,6.0,7.0,8.0}), 0.0,5.0,0.0,7.0)); + __m128 test_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { // CHECK-LABEL: test_mm_mask_unpacklo_ps // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> @@ -8039,6 +8067,8 @@ __m128i test_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m1 return _mm_mask_unpackhi_epi32(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v4si(_mm_mask_unpackhi_epi32(_mm_setzero_si128(), 0xA, (__m128i)(__v4si){0,1,2,3}, (__m128i)(__v4si){4,5,6,7}), 0,6,0,7)); + __m128i test_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_unpackhi_epi32 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> @@ -8046,6 +8076,8 @@ __m128i test_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return _mm_maskz_unpackhi_epi32(__U, __A, __B); } +TEST_CONSTEXPR(match_v4si(_mm_maskz_unpackhi_epi32(0x5, (__m128i)(__v4si){0,1,2,3}, (__m128i)(__v4si){4,5,6,7}), 2,0,3,0)); + __m256i test_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_unpackhi_epi32 // 
CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> @@ -8997,6 +9029,8 @@ __m256 test_mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) { return _mm256_maskz_broadcast_f32x4(__M, __A); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x4(0xAA, (__m128)(__v4sf){0,1,2,3}), 0,1,0,3,0,1,0,3)); + __m256i test_mm256_broadcast_i32x4(__m128i const* __A) { // CHECK-LABEL: test_mm256_broadcast_i32x4 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> @@ -9018,6 +9052,8 @@ __m256i test_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i const* __A) { return _mm256_maskz_broadcast_i32x4(__M, _mm_loadu_si128(__A)); } +TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x4(0xAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,3,0,1,0,3)); + __m256d test_mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A) { // CHECK-LABEL: test_mm256_mask_broadcastsd_pd // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer @@ -9875,6 +9911,7 @@ __m128 test_mm256_extractf32x4_ps(__m256 __A) { // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> return _mm256_extractf32x4_ps(__A, 1); } +TEST_CONSTEXPR(match_m128(_mm256_extractf32x4_ps(((__m256){0,1,2,3, 4,5,6,7}), 1),4.0f, 5.0f, 6.0f, 7.0f)); __m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_mask_extractf32x4_ps @@ -9882,6 +9919,7 @@ __m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm256_mask_extractf32x4_ps(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_m128(_mm256_mask_extractf32x4_ps((((__m128){100,101,102,103})), (__mmask8)0x5, (((__m256){0,1,2,3, 4,5,6,7})), 1), 4.0f, 101.0f, 6.0f, 103.0f)); __m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_maskz_extractf32x4_ps @@ -9889,12 +9927,14 @@ __m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm256_maskz_extractf32x4_ps(__U, __A, 1); } +TEST_CONSTEXPR(match_m128(_mm256_maskz_extractf32x4_ps((__mmask8)0x3, (((__m256){0,1,2,3, 4,5,6,7})), 1), 4.0f, 5.0f, 0.0f, 0.0f)); __m128i test_mm256_extracti32x4_epi32(__m256i __A) { // CHECK-LABEL: test_mm256_extracti32x4_epi32 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <4 x i32> return _mm256_extracti32x4_epi32(__A, 1); } +TEST_CONSTEXPR(match_m128i(_mm256_extracti32x4_epi32((((__m256i)(__v8si){0,1,2,3, 4,5,6,7})), 1), 0x0000000500000004ULL, 0x0000000700000006ULL)); __m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_extracti32x4_epi32 @@ -9902,6 +9942,7 @@ __m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm256_mask_extracti32x4_epi32(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_m128i(_mm256_mask_extracti32x4_epi32((((__m128i)(__v4si){100,101,102,103})), (__mmask8)0xA, (((__m256i)(__v8si){0,1,2,3, 4,5,6,7})), 1),0x0000000500000064ULL, 0x0000000700000066ULL)); __m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_maskz_extracti32x4_epi32 @@ -9909,6 +9950,7 @@ __m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm256_maskz_extracti32x4_epi32(__U, 
__A, 1); } +TEST_CONSTEXPR(match_m128i(_mm256_maskz_extracti32x4_epi32((__mmask8)0x3, (((__m256i)(__v8si){0,1,2,3, 4,5,6,7})), 1), 0x0000000500000004ULL, 0x0000000000000000ULL)); __m256 test_mm256_insertf32x4(__m256 __A, __m128 __B) { // CHECK-LABEL: test_mm256_insertf32x4 @@ -10324,6 +10366,8 @@ __m128 test_mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) { return _mm_mask_movehdup_ps(__W, __U, __A); } +TEST_CONSTEXPR(match_m128(_mm_mask_movehdup_ps(_mm_setzero_ps(), 0xF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 2.f,2.f,4.f,4.f)); + __m128 test_mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) { // CHECK-LABEL: test_mm_maskz_movehdup_ps // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> @@ -10338,6 +10382,8 @@ __m256 test_mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) { return _mm256_mask_movehdup_ps(__W, __U, __A); } +TEST_CONSTEXPR(match_m256(_mm256_mask_movehdup_ps(_mm256_setzero_ps(), 0xAA, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 0,2,0,4,0,6,0,8)); + __m256 test_mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_maskz_movehdup_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> @@ -10352,6 +10398,8 @@ __m128 test_mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) { return _mm_mask_moveldup_ps(__W, __U, __A); } +TEST_CONSTEXPR(match_m128(_mm_mask_moveldup_ps(_mm_setzero_ps(), 0xF, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 1.f,1.f,3.f,3.f)); + __m128 test_mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) { // CHECK-LABEL: test_mm_maskz_moveldup_ps // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> @@ -10366,6 +10414,8 @@ __m256 test_mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) { return _mm256_mask_moveldup_ps(__W, __U, __A); } +TEST_CONSTEXPR(match_m256(_mm256_mask_moveldup_ps(_mm256_setzero_ps(), 0xAA, (__m256)(__v8sf){1,2,3,4,5,6,7,8}), 0,1,0,3,0,5,0,7)); + __m256 test_mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_maskz_moveldup_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index c0e46de8b81bf..116d86fcd597d 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -1688,24 +1688,37 @@ __m128i test_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m12 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_shuffle_epi8(__W,__U,__A,__B); } + +TEST_CONSTEXPR(match_v16qi(_mm_mask_shuffle_epi8((__m128i)(__v16qi){1,1,1,1,1,1,1,1,2,2,4,4,6,6,8,8}, 0x00FF, (__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, (__m128i)(__v16qi){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 15,14,13,12,11,10,9,8,2,2,4,4,6,6,8,8)); + __m128i test_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_shuffle_epi8 // CHECK: @llvm.x86.ssse3.pshuf.b // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_shuffle_epi8(__U,__A,__B); } + +TEST_CONSTEXPR(match_v16qi(_mm_maskz_shuffle_epi8(0xAAAA, (__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, (__m128i)(__v16qi){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 0,14,0,12,0,10,0,8,0,6,0,4,0,2,0,0)); + __m256i test_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_shuffle_epi8 // CHECK: @llvm.x86.avx2.pshuf.b // CHECK: select <32 x i1> 
%{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_shuffle_epi8(__W,__U,__A,__B); } + +TEST_CONSTEXPR(match_v32qi(_mm256_mask_shuffle_epi8((__m256i)(__v32qi){1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4}, 0x80808080, (__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 1,1,1,1,1,1,1,8,2,2,2,2,2,2,2,0,3,3,3,3,3,3,3,24,4,4,4,4,4,4,4,16)); + + __m256i test_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_shuffle_epi8 // CHECK: @llvm.x86.avx2.pshuf.b // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_shuffle_epi8(__U,__A,__B); } + +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_shuffle_epi8(0x0000FFFF, (__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)); + __m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_subs_epi8 // CHECK: @llvm.ssub.sat.v16i8 @@ -2048,6 +2061,7 @@ __m128i test_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_mulhrs_epi16(__W, __U, __X, __Y); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_mulhrs_epi16(_mm_set1_epi16(1), 0x0F, (__m128i)(__v8hi){+100, +200, -300, -400, +500, +600, -700, +800}, (__m128i)(__v8hi){+8000, -7000, +6000, -5000, +4000, -3000, +2000, -1000}), +24, -43, -55, +61, +1, +1, +1, +1)); __m128i test_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_maskz_mulhrs_epi16 @@ -2055,6 +2069,7 @@ __m128i test_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_mulhrs_epi16(__U, __X, __Y); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_mulhrs_epi16(0x0F, (__m128i)(__v8hi){+100, +200, -300, -400, +500, +600, -700, +800}, (__m128i)(__v8hi){+8000, -7000, +6000, -5000, +4000, -3000, +2000, -1000}), +24, -43, -55, +61, 0, 0, 0, 0)); __m256i test_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { // CHECK-LABEL: test_mm256_mask_mulhrs_epi16 @@ -2062,6 +2077,7 @@ __m256i test_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_mulhrs_epi16(__W, __U, __X, __Y); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_mulhrs_epi16(_mm256_set1_epi16(1), 0xF00F, (__m256i)(__v16hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600}, (__m256i)(__v16hi){+1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +5, -9, -13, +16, +1, +1, +1, +1, +1, +1, +1, +1, -16, +13, +9, -5)); __m256i test_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { // CHECK-LABEL: test_mm256_maskz_mulhrs_epi16 @@ -2069,6 +2085,7 @@ __m256i test_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_mulhrs_epi16(__U, __X, __Y); } 
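+// Hand-check for the mulhrs constexpr vectors (a sketch of the pmulhrsw
+// semantics, assuming the usual (((a * b) >> 14) + 1) >> 1 per-lane formula):
+// lane 0 below is (((+100 * +1600) >> 14) + 1) >> 1 = (9 + 1) >> 1 = +5, and
+// maskz lanes whose bit in 0xF00F is clear come out as 0.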
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_mulhrs_epi16(0xF00F, (__m256i)(__v16hi){+100, +200, -300, -400, +500, +600, -700, +800, -900, -1000, +1100, +1200, -1300, -1400, +1500, +1600}, (__m256i)(__v16hi){+1600, -1500, +1400, -1300, +1200, -1100, +1000, -900, +800, -700, +600, -500, +400, -300, +200, -100}), +5, -9, -13, +16, 0, 0, 0, 0, 0, 0, 0, 0, -16, +13, +9, -5)); __m128i test_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_mulhi_epu16 @@ -2759,13 +2776,13 @@ __m128i test_mm_loadu_epi16(void const *__P) { __m128i test_mm_mask_loadu_epi16(__m128i __W, __mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_mask_loadu_epi16 - // CHECK: @llvm.masked.load.v8i16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: @llvm.masked.load.v8i16.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x i16> %{{.*}}) return _mm_mask_loadu_epi16(__W, __U, __P); } __m128i test_mm_maskz_loadu_epi16(__mmask8 __U, void const *__P) { // CHECK-LABEL: test_mm_maskz_loadu_epi16 - // CHECK: @llvm.masked.load.v8i16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: @llvm.masked.load.v8i16.p0(ptr align 1 %{{.*}}, <8 x i1> %{{.*}}, <8 x i16> %{{.*}}) return _mm_maskz_loadu_epi16(__U, __P); } @@ -2777,13 +2794,13 @@ __m256i test_mm256_loadu_epi16(void const *__P) { __m256i test_mm256_mask_loadu_epi16(__m256i __W, __mmask16 __U, void const *__P) { // CHECK-LABEL: test_mm256_mask_loadu_epi16 - // CHECK: @llvm.masked.load.v16i16.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i16> %{{.*}}) + // CHECK: @llvm.masked.load.v16i16.p0(ptr align 1 %{{.*}}, <16 x i1> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_mask_loadu_epi16(__W, __U, __P); } __m256i test_mm256_maskz_loadu_epi16(__mmask16 __U, void const *__P) { // CHECK-LABEL: test_mm256_maskz_loadu_epi16 - // CHECK: @llvm.masked.load.v16i16.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i16> %{{.*}}) + // CHECK: @llvm.masked.load.v16i16.p0(ptr align 1 %{{.*}}, <16 x i1> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_maskz_loadu_epi16(__U, __P); } @@ -2795,13 +2812,13 @@ __m128i test_mm_loadu_epi8(void const *__P) { __m128i test_mm_mask_loadu_epi8(__m128i __W, __mmask16 __U, void const *__P) { // CHECK-LABEL: test_mm_mask_loadu_epi8 - // CHECK: @llvm.masked.load.v16i8.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: @llvm.masked.load.v16i8.p0(ptr align 1 %{{.*}}, <16 x i1> %{{.*}}, <16 x i8> %{{.*}}) return _mm_mask_loadu_epi8(__W, __U, __P); } __m128i test_mm_maskz_loadu_epi8(__mmask16 __U, void const *__P) { // CHECK-LABEL: test_mm_maskz_loadu_epi8 - // CHECK: @llvm.masked.load.v16i8.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: @llvm.masked.load.v16i8.p0(ptr align 1 %{{.*}}, <16 x i1> %{{.*}}, <16 x i8> %{{.*}}) return _mm_maskz_loadu_epi8(__U, __P); } @@ -2813,13 +2830,13 @@ __m256i test_mm256_loadu_epi8(void const *__P) { __m256i test_mm256_mask_loadu_epi8(__m256i __W, __mmask32 __U, void const *__P) { // CHECK-LABEL: test_mm256_mask_loadu_epi8 - // CHECK: @llvm.masked.load.v32i8.p0(ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i8> %{{.*}}) + // CHECK: @llvm.masked.load.v32i8.p0(ptr align 1 %{{.*}}, <32 x i1> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_mask_loadu_epi8(__W, __U, __P); } __m256i test_mm256_maskz_loadu_epi8(__mmask32 __U, void const *__P) { // CHECK-LABEL: test_mm256_maskz_loadu_epi8 - // CHECK: @llvm.masked.load.v32i8.p0(ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i8> %{{.*}}) + // CHECK: @llvm.masked.load.v32i8.p0(ptr 
align 1 %{{.*}}, <32 x i1> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_maskz_loadu_epi8(__U, __P); } @@ -2831,7 +2848,7 @@ void test_mm_storeu_epi16(void *__p, __m128i __a) { void test_mm_mask_storeu_epi16(void *__P, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_storeu_epi16 - // CHECK: @llvm.masked.store.v8i16.p0(<8 x i16> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v8i16.p0(<8 x i16> %{{.*}}, ptr align 1 %{{.*}}, <8 x i1> %{{.*}}) return _mm_mask_storeu_epi16(__P, __U, __A); } @@ -2843,7 +2860,7 @@ void test_mm256_storeu_epi16(void *__P, __m256i __A) { void test_mm256_mask_storeu_epi16(void *__P, __mmask16 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_storeu_epi16 - // CHECK: @llvm.masked.store.v16i16.p0(<16 x i16> %{{.*}}, ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v16i16.p0(<16 x i16> %{{.*}}, ptr align 1 %{{.*}}, <16 x i1> %{{.*}}) return _mm256_mask_storeu_epi16(__P, __U, __A); } @@ -2855,7 +2872,7 @@ void test_mm_storeu_epi8(void *__p, __m128i __a) { void test_mm_mask_storeu_epi8(void *__P, __mmask16 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_storeu_epi8 - // CHECK: @llvm.masked.store.v16i8.p0(<16 x i8> %{{.*}}, ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v16i8.p0(<16 x i8> %{{.*}}, ptr align 1 %{{.*}}, <16 x i1> %{{.*}}) return _mm_mask_storeu_epi8(__P, __U, __A); } @@ -2867,7 +2884,7 @@ void test_mm256_storeu_epi8(void *__P, __m256i __A) { void test_mm256_mask_storeu_epi8(void *__P, __mmask32 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_storeu_epi8 - // CHECK: @llvm.masked.store.v32i8.p0(<32 x i8> %{{.*}}, ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}) + // CHECK: @llvm.masked.store.v32i8.p0(<32 x i8> %{{.*}}, ptr align 1 %{{.*}}, <32 x i1> %{{.*}}) return _mm256_mask_storeu_epi8(__P, __U, __A); } __mmask16 test_mm_test_epi8_mask(__m128i __A, __m128i __B) { diff --git a/clang/test/CodeGen/X86/avx512vlcd-builtins.c b/clang/test/CodeGen/X86/avx512vlcd-builtins.c index 1619305dd5210..29fc6fd2e7fc8 100644 --- a/clang/test/CodeGen/X86/avx512vlcd-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlcd-builtins.c @@ -20,6 +20,7 @@ __m128i test_mm_broadcastmb_epi64(__m128i a,__m128i b) { // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1 return _mm_broadcastmb_epi64(_mm_cmpeq_epi32_mask (a, b)); } +TEST_CONSTEXPR(match_v2du(_mm_broadcastmb_epi64((__mmask8)(76)), 76, 76)); __m256i test_mm256_broadcastmb_epi64(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_broadcastmb_epi64 @@ -32,6 +33,7 @@ __m256i test_mm256_broadcastmb_epi64(__m256i a, __m256i b) { // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3 return _mm256_broadcastmb_epi64(_mm256_cmpeq_epi64_mask ( a, b)); } +TEST_CONSTEXPR(match_v4di(_mm256_broadcastmb_epi64((__mmask8)(67)), 67, 67, 67, 67)); __m128i test_mm_broadcastmw_epi32(__m512i a, __m512i b) { // CHECK-LABEL: test_mm_broadcastmw_epi32 @@ -43,6 +45,7 @@ __m128i test_mm_broadcastmw_epi32(__m512i a, __m512i b) { // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3 return _mm_broadcastmw_epi32(_mm512_cmpeq_epi32_mask ( a, b)); } +TEST_CONSTEXPR(match_v4su(_mm_broadcastmw_epi32((__mmask16)(0xbabe)), 0xbabe, 0xbabe, 0xbabe, 0xbabe)); __m256i test_mm256_broadcastmw_epi32(__m512i a, __m512i b) { // CHECK-LABEL: test_mm256_broadcastmw_epi32 @@ -58,87 +61,119 @@ __m256i test_mm256_broadcastmw_epi32(__m512i a, __m512i b) { // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 7 return _mm256_broadcastmw_epi32(_mm512_cmpeq_epi32_mask ( a, b)); 
} +TEST_CONSTEXPR(match_v8si(_mm256_broadcastmw_epi32((__mmask16)(0xcafe)), 0xcafe,0xcafe,0xcafe,0xcafe, 0xcafe,0xcafe,0xcafe,0xcafe)); __m128i test_mm_conflict_epi64(__m128i __A) { // CHECK-LABEL: test_mm_conflict_epi64 // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64> %{{.*}}) - return _mm_conflict_epi64(__A); + return _mm_conflict_epi64(__A); } +TEST_CONSTEXPR(match_v2di(_mm_conflict_epi64((__m128i)(__v2di){1, 2}), 0, 0)); +TEST_CONSTEXPR(match_v2di(_mm_conflict_epi64((__m128i)(__v2di){5, 5}), 0, 1)); + __m128i test_mm_mask_conflict_epi64(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_conflict_epi64 // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64> %{{.*}}) // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} - return _mm_mask_conflict_epi64(__W, __U, __A); + return _mm_mask_conflict_epi64(__W, __U, __A); } +TEST_CONSTEXPR(match_v2di(_mm_mask_conflict_epi64((__m128i)(__v2di){0xFF, 0xFF}, 0x2, (__m128i)(__v2di){5, 5}), 0xFF, 1)); + __m128i test_mm_maskz_conflict_epi64(__mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_maskz_conflict_epi64 // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64> %{{.*}}) // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} - return _mm_maskz_conflict_epi64(__U, __A); + return _mm_maskz_conflict_epi64(__U, __A); } +TEST_CONSTEXPR(match_v2di(_mm_maskz_conflict_epi64(0x2, (__m128i)(__v2di){5, 5}), 0, 1)); + __m256i test_mm256_conflict_epi64(__m256i __A) { // CHECK-LABEL: test_mm256_conflict_epi64 // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64> %{{.*}}) - return _mm256_conflict_epi64(__A); + return _mm256_conflict_epi64(__A); } +TEST_CONSTEXPR(match_v4di(_mm256_conflict_epi64((__m256i)(__v4di){1, 2, 1, 3}), 0, 0, 1, 0)); +TEST_CONSTEXPR(match_v4di(_mm256_conflict_epi64((__m256i)(__v4di){7, 7, 7, 7}), 0, 1, 3, 7)); +TEST_CONSTEXPR(match_v4di(_mm256_conflict_epi64((__m256i)(__v4di){1, 2, 3, 4}), 0, 0, 0, 0)); + __m256i test_mm256_mask_conflict_epi64(__m256i __W, __mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_conflict_epi64 // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} - return _mm256_mask_conflict_epi64(__W, __U, __A); + return _mm256_mask_conflict_epi64(__W, __U, __A); } +TEST_CONSTEXPR(match_v4di(_mm256_mask_conflict_epi64((__m256i)(__v4di){0xFF, 0xFF, 0xFF, 0xFF}, 0x5, (__m256i)(__v4di){1, 2, 1, 3}), 0, 0xFF, 1, 0xFF)); + __m256i test_mm256_maskz_conflict_epi64(__mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_maskz_conflict_epi64 // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} - return _mm256_maskz_conflict_epi64(__U, __A); + return _mm256_maskz_conflict_epi64(__U, __A); } +TEST_CONSTEXPR(match_v4di(_mm256_maskz_conflict_epi64(0x5, (__m256i)(__v4di){1, 2, 1, 3}), 0, 0, 1, 0)); + __m128i test_mm_conflict_epi32(__m128i __A) { // CHECK-LABEL: test_mm_conflict_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32> %{{.*}}) - return _mm_conflict_epi32(__A); + return _mm_conflict_epi32(__A); } +TEST_CONSTEXPR(match_v4si(_mm_conflict_epi32((__m128i)(__v4si){1, 2, 1, 3}), 0, 0, 1, 0)); +TEST_CONSTEXPR(match_v4si(_mm_conflict_epi32((__m128i)(__v4si){3, 3, 3, 3}), 0, 1, 3, 7)); +TEST_CONSTEXPR(match_v4si(_mm_conflict_epi32((__m128i)(__v4si){1, 2, 3, 4}), 0, 0, 0, 0)); + 
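+// vpconflict reference for the constexpr tests here: lane i sets bit j for
+// every lower lane j < i holding an equal value, so {3,3,3,3} yields
+// {0b0, 0b1, 0b11, 0b111} = {0, 1, 3, 7} and an all-distinct input yields all
+// zeros; the mask/maskz forms below then blend from __W or zero the
+// unselected lanes.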
__m128i test_mm_mask_conflict_epi32(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_conflict_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} - return _mm_mask_conflict_epi32(__W, __U, __A); + return _mm_mask_conflict_epi32(__W, __U, __A); } +TEST_CONSTEXPR(match_v4si(_mm_mask_conflict_epi32((__m128i)(__v4si){0xFF, 0xFF, 0xFF, 0xFF}, 0x5, (__m128i)(__v4si){1, 2, 1, 3}), 0, 0xFF, 1, 0xFF)); + __m128i test_mm_maskz_conflict_epi32(__mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_maskz_conflict_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} - return _mm_maskz_conflict_epi32(__U, __A); + return _mm_maskz_conflict_epi32(__U, __A); } +TEST_CONSTEXPR(match_v4si(_mm_maskz_conflict_epi32(0x5, (__m128i)(__v4si){1, 2, 1, 3}), 0, 0, 1, 0)); + __m256i test_mm256_conflict_epi32(__m256i __A) { // CHECK-LABEL: test_mm256_conflict_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32> %{{.*}}) - return _mm256_conflict_epi32(__A); + return _mm256_conflict_epi32(__A); } +TEST_CONSTEXPR(match_v8si(_mm256_conflict_epi32((__m256i)(__v8si){1, 2, 1, 3, 2, 4, 1, 5}), 0, 0, 1, 0, 2, 0, 5, 0)); +TEST_CONSTEXPR(match_v8si(_mm256_conflict_epi32((__m256i)(__v8si){4, 4, 4, 4, 4, 4, 4, 4}), 0, 1, 3, 7, 15, 31, 63, 127)); +TEST_CONSTEXPR(match_v8si(_mm256_conflict_epi32((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}), 0, 0, 0, 0, 0, 0, 0, 0)); + __m256i test_mm256_mask_conflict_epi32(__m256i __W, __mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_conflict_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} - return _mm256_mask_conflict_epi32(__W, __U, __A); + return _mm256_mask_conflict_epi32(__W, __U, __A); } +TEST_CONSTEXPR(match_v8si(_mm256_mask_conflict_epi32((__m256i)(__v8si){0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}, /*0101 0101=*/0x55, (__m256i)(__v8si){1, 2, 1, 3, 2, 4, 1, 5}), 0, 0xFF, 1, 0xFF, 2, 0xFF, 5, 0xFF)); + __m256i test_mm256_maskz_conflict_epi32(__mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_maskz_conflict_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} - return _mm256_maskz_conflict_epi32(__U, __A); + return _mm256_maskz_conflict_epi32(__U, __A); } +TEST_CONSTEXPR(match_v8si(_mm256_maskz_conflict_epi32(0x55, (__m256i)(__v8si){1, 2, 1, 3, 2, 4, 1, 5}), 0, 0, 1, 0, 2, 0, 5, 0)); + __m128i test_mm_lzcnt_epi32(__m128i __A) { // CHECK-LABEL: test_mm_lzcnt_epi32 // CHECK: call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 true) diff --git a/clang/test/CodeGen/X86/avx512vldq-builtins.c b/clang/test/CodeGen/X86/avx512vldq-builtins.c index 938845799acf5..1bfc0229eeb26 100644 --- a/clang/test/CodeGen/X86/avx512vldq-builtins.c +++ b/clang/test/CodeGen/X86/avx512vldq-builtins.c @@ -987,6 +987,8 @@ __m256 test_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) { return _mm256_mask_broadcast_f32x2(__O, __M, __A); } +TEST_CONSTEXPR(match_m256(_mm256_mask_broadcast_f32x2(_mm256_setzero_ps(), 0xAA, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 0,2.f,0,2.f,0,2.f,0,2.f)); + __m256 test_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) { // CHECK-LABEL: test_mm256_maskz_broadcast_f32x2 // CHECK: shufflevector <4 x float> %{{.*}}, <4 x 
float> %{{.*}}, <8 x i32> @@ -994,6 +996,8 @@ __m256 test_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) { return _mm256_maskz_broadcast_f32x2(__M, __A); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_broadcast_f32x2(0xAA, (__m128)(__v4sf){1.f,2.f,3.f,4.f}), 0,2.f,0,2.f,0,2.f,0,2.f)); + __m256d test_mm256_broadcast_f64x2(double const* __A) { // CHECK-LABEL: test_mm256_broadcast_f64x2 // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> @@ -1008,6 +1012,8 @@ __m256d test_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, double const* return _mm256_mask_broadcast_f64x2(__O, __M, _mm_loadu_pd(__A)); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_broadcast_f64x2(_mm256_setzero_pd(), 0xA, (__m128d)(__v2df){1.0,2.0}), 0,2.0,0,2.0)); + __m256d test_mm256_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) { // CHECK-LABEL: test_mm256_maskz_broadcast_f64x2 // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> @@ -1015,6 +1021,8 @@ __m256d test_mm256_maskz_broadcast_f64x2(__mmask8 __M, double const* __A) { return _mm256_maskz_broadcast_f64x2(__M, _mm_loadu_pd(__A)); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_broadcast_f64x2(0xA, (__m128d)(__v2df){1.0,2.0}), 0,2.0,0,2.0)); + __m128i test_mm_broadcast_i32x2(__m128i __A) { // CHECK-LABEL: test_mm_broadcast_i32x2 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> @@ -1029,6 +1037,8 @@ __m128i test_mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) { return _mm_mask_broadcast_i32x2(__O, __M, __A); } +TEST_CONSTEXPR(match_v4si(_mm_mask_broadcast_i32x2(_mm_setzero_si128(), 0xF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1)); + __m128i test_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { // CHECK-LABEL: test_mm_maskz_broadcast_i32x2 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> @@ -1036,6 +1046,8 @@ __m128i test_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { return _mm_maskz_broadcast_i32x2(__M, __A); } +TEST_CONSTEXPR(match_v4si(_mm_maskz_broadcast_i32x2(0xF, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1)); + __m256i test_mm256_broadcast_i32x2(__m128i __A) { // CHECK-LABEL: test_mm256_broadcast_i32x2 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> @@ -1050,6 +1062,8 @@ __m256i test_mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A) return _mm256_mask_broadcast_i32x2(__O, __M, __A); } +TEST_CONSTEXPR(match_v8si(_mm256_mask_broadcast_i32x2(_mm256_setzero_si256(), 0xAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1)); + __m256i test_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { // CHECK-LABEL: test_mm256_maskz_broadcast_i32x2 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> @@ -1057,6 +1071,8 @@ __m256i test_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { return _mm256_maskz_broadcast_i32x2(__M, __A); } +TEST_CONSTEXPR(match_v8si(_mm256_maskz_broadcast_i32x2(0xAA, (__m128i)(__v4si){0,1,2,3}), 0,1,0,1,0,1,0,1)); + __m256i test_mm256_broadcast_i64x2(__m128i const* __A) { // CHECK-LABEL: test_mm256_broadcast_i64x2 // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> @@ -1078,11 +1094,14 @@ __m256i test_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i const* __A) { return _mm256_maskz_broadcast_i64x2(__M, _mm_loadu_si128(__A)); } +TEST_CONSTEXPR(match_v4di(_mm256_maskz_broadcast_i64x2(0xF, (__m128i)(__v2di){1,2}), 1,2,1,2)); + __m128d test_mm256_extractf64x2_pd(__m256d __A) { // CHECK-LABEL: test_mm256_extractf64x2_pd // CHECK: shufflevector <4 x double> 
%{{.*}}, <4 x double> poison, <2 x i32> return _mm256_extractf64x2_pd(__A, 1); } +TEST_CONSTEXPR(match_m128d(_mm256_extractf64x2_pd(((__m256d){0.0,1.0,2.0,3.0}), 1), 2.0, 3.0)); __m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A) { // CHECK-LABEL: test_mm256_mask_extractf64x2_pd @@ -1090,6 +1109,7 @@ __m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A) // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm256_mask_extractf64x2_pd(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_m128d(_mm256_mask_extractf64x2_pd((((__m128d){100.0, 101.0})), (__mmask8)0x1, (((__m256d){0.0,1.0,2.0,3.0})),1), 2.0, 101.0)); __m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) { // CHECK-LABEL: test_mm256_maskz_extractf64x2_pd @@ -1097,12 +1117,14 @@ __m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) { // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm256_maskz_extractf64x2_pd(__U, __A, 1); } +TEST_CONSTEXPR(match_m128d(_mm256_maskz_extractf64x2_pd((__mmask8)0x2,(((__m256d){0.0,1.0,2.0,3.0})),1), 0.0, 3.0)); __m128i test_mm256_extracti64x2_epi64(__m256i __A) { // CHECK-LABEL: test_mm256_extracti64x2_epi64 // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> return _mm256_extracti64x2_epi64(__A, 1); } +TEST_CONSTEXPR(match_m128i(_mm256_extracti64x2_epi64(((__m256i){0ULL,1ULL,2ULL,3ULL}), 1), 2ULL, 3ULL)); __m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_extracti64x2_epi64 @@ -1110,6 +1132,7 @@ __m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm256_mask_extracti64x2_epi64(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_m128i(_mm256_mask_extracti64x2_epi64((((__m128i){100ULL, 101ULL})), (__mmask8)0x1, (((__m256i){0ULL,1ULL,2ULL,3ULL})), 1), 2ULL, 101ULL)); __m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_maskz_extracti64x2_epi64 @@ -1117,6 +1140,7 @@ __m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) { // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm256_maskz_extracti64x2_epi64(__U, __A, 1); } +TEST_CONSTEXPR(match_m128i(_mm256_maskz_extracti64x2_epi64((__mmask8)0x2, (((__m256i){0ULL,1ULL,2ULL,3ULL})),1), 0ULL, 3ULL)); __m256d test_mm256_insertf64x2(__m256d __A, __m128d __B) { // CHECK-LABEL: test_mm256_insertf64x2 diff --git a/clang/test/CodeGen/X86/avxifma-builtins.c b/clang/test/CodeGen/X86/avxifma-builtins.c index aa151591ed143..70531da02df21 100644 --- a/clang/test/CodeGen/X86/avxifma-builtins.c +++ b/clang/test/CodeGen/X86/avxifma-builtins.c @@ -8,8 +8,9 @@ // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxifma -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s - #include +#include "builtin_test_helpers.h" + __m128i test_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { // CHECK-LABEL: test_mm_madd52hi_epu64 @@ -17,44 +18,207 @@ __m128i test_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { return _mm_madd52hi_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v2di(_mm_madd52hi_epu64( + 
(__m128i)((__v2du){50, 100}), + (__m128i)((__v2du){10, 20}), + (__m128i)((__v2du){5, 6})), + 50, 100)); + +TEST_CONSTEXPR(match_v2di(_mm_madd52hi_epu64( + (__m128i)((__v2du){0, 0}), + (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0}), + (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0})), + 0xFFFFFFFFFFFFEull, 0)); + __m256i test_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) { // CHECK-LABEL: test_mm256_madd52hi_epu64 // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}) return _mm256_madd52hi_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v4di(_mm256_madd52hi_epu64( + (__m256i)((__v4du){100, 200, 300, 400}), + (__m256i)((__v4du){10, 20, 30, 40}), + (__m256i)((__v4du){5, 6, 7, 8})), + 100, 200, 300, 400)); + +TEST_CONSTEXPR(match_v4di(_mm256_madd52hi_epu64( + (__m256i)((__v4du){0, 0, 0, 0}), + (__m256i)((__v4du){0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, 0, 0}), + (__m256i)((__v4du){0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, 0, 0})), + 0xFFFFFFFFFFFFEull, 0xFFFFFFFFFFFFEull, 0, 0)); + __m128i test_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) { // CHECK-LABEL: test_mm_madd52lo_epu64 // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) return _mm_madd52lo_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v2di(_mm_madd52lo_epu64( + (__m128i)((__v2du){0, 0}), + (__m128i)((__v2du){10, 0}), + (__m128i)((__v2du){5, 0})), + 50, 0)); + +TEST_CONSTEXPR(match_v2di(_mm_madd52lo_epu64( + (__m128i)((__v2du){1, 2}), + (__m128i)((__v2du){10, 20}), + (__m128i)((__v2du){2, 3})), + 21, 62)); + +TEST_CONSTEXPR(match_v2di(_mm_madd52lo_epu64( + (__m128i)((__v2du){0, 0}), + (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0}), + (__m128i)((__v2du){1, 0})), + 0xFFFFFFFFFFFFFull, 0)); + __m256i test_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) { // CHECK-LABEL: test_mm256_madd52lo_epu64 // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}) return _mm256_madd52lo_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_epu64( + (__m256i)((__v4du){1, 2, 3, 4}), + (__m256i)((__v4du){10, 20, 30, 40}), + (__m256i)((__v4du){2, 3, 4, 5})), + 21, 62, 123, 204)); + +TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_epu64( + (__m256i)((__v4du){0, 0, 0, 0}), + (__m256i)((__v4du){0xFFFFFFFFFFFFFull, 0, 0, + 0}), + (__m256i)((__v4du){1, 0, 0, 0})), + 0xFFFFFFFFFFFFFull, 0, 0, 0)); + +TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_epu64( + (__m256i)((__v4du){0, 0, 0, 0}), + (__m256i)((__v4du){0x1F000000000000ull, 0, 0, + 0}), + (__m256i)((__v4du){2, 0, 0, 0})), + 0xE000000000000ull, 0, 0, 0)); + __m128i test_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { -// CHECK-LABEL: test_mm_madd52hi_avx_epu64 -// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + // CHECK-LABEL: test_mm_madd52hi_avx_epu64 + // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) return _mm_madd52hi_avx_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v2di(_mm_madd52hi_avx_epu64( + (__m128i)((__v2du){50, 100}), + (__m128i)((__v2du){10, 20}), + (__m128i)((__v2du){5, 6})), + 50, 100)); + +TEST_CONSTEXPR(match_v2di(_mm_madd52hi_avx_epu64( + (__m128i)((__v2du){100, 0}), + (__m128i)((__v2du){10, 0}), + (__m128i)((__v2du){5, 0})), + 100, 0)); + +TEST_CONSTEXPR(match_v2di(_mm_madd52hi_avx_epu64( + 
(__m128i)((__v2du){0, 0}), + (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0}), + (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0})), + 0xFFFFFFFFFFFFEull, 0)); + __m256i test_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { -// CHECK-LABEL: test_mm256_madd52hi_avx_epu64 -// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}) + // CHECK-LABEL: test_mm256_madd52hi_avx_epu64 + // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}) return _mm256_madd52hi_avx_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v4di(_mm256_madd52hi_avx_epu64( + (__m256i)((__v4du){0, 0, 0, 0}), + (__m256i)((__v4du){0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, 0, 0}), + (__m256i)((__v4du){0xFFFFFFFFFFFFFull, + 0xFFFFFFFFFFFFFull, 0, 0})), + 0xFFFFFFFFFFFFEull, 0xFFFFFFFFFFFFEull, 0, 0)); + +TEST_CONSTEXPR(match_v4di(_mm256_madd52hi_avx_epu64( + (__m256i)((__v4du){100, 200, 300, 400}), + (__m256i)((__v4du){10, 20, 30, 40}), + (__m256i)((__v4du){5, 6, 7, 8})), + 100, 200, 300, 400)); + +TEST_CONSTEXPR(match_v4di(_mm256_madd52hi_avx_epu64( + (__m256i)((__v4du){0, 0, 0, 0}), + (__m256i)((__v4du){0xFFFFFFFFFFFFFull, 0, 0, + 0}), + (__m256i)((__v4du){0xFFFFFFFFFFFFFull, 0, 0, + 0})), + 0xFFFFFFFFFFFFEull, 0, 0, 0)); + __m128i test_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { -// CHECK-LABEL: test_mm_madd52lo_avx_epu64 -// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + // CHECK-LABEL: test_mm_madd52lo_avx_epu64 + // CHECK: call {{.*}}<2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) return _mm_madd52lo_avx_epu64(__X, __Y, __Z); } +TEST_CONSTEXPR(match_v2di(_mm_madd52lo_avx_epu64( + (__m128i)((__v2du){0, 0}), + (__m128i)((__v2du){10, 0}), + (__m128i)((__v2du){5, 0})), + 50, 0)); + +TEST_CONSTEXPR(match_v2di(_mm_madd52lo_avx_epu64( + (__m128i)((__v2du){100, 0}), + (__m128i)((__v2du){20, 0}), + (__m128i)((__v2du){30, 0})), + 700, 0)); + +TEST_CONSTEXPR(match_v2di(_mm_madd52lo_avx_epu64( + (__m128i)((__v2du){1, 2}), + (__m128i)((__v2du){10, 20}), + (__m128i)((__v2du){2, 3})), + 21, 62)); + +TEST_CONSTEXPR(match_v2di(_mm_madd52lo_avx_epu64( + (__m128i)((__v2du){0, 0}), + (__m128i)((__v2du){0xFFFFFFFFFFFFFull, 0}), + (__m128i)((__v2du){1, 0})), + 0xFFFFFFFFFFFFFull, 0)); + __m256i test_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { -// CHECK-LABEL: test_mm256_madd52lo_avx_epu64 -// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}) + // CHECK-LABEL: test_mm256_madd52lo_avx_epu64 + // CHECK: call {{.*}}<4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}) return _mm256_madd52lo_avx_epu64(__X, __Y, __Z); } + +TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_avx_epu64( + (__m256i)((__v4du){1, 2, 3, 4}), + (__m256i)((__v4du){10, 20, 30, 40}), + (__m256i)((__v4du){2, 3, 4, 5})), + 21, 62, 123, 204)); + + + +TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_avx_epu64( + (__m256i)((__v4du){0, 0, 0, 0}), + (__m256i)((__v4du){0xFFFFFFFFFFFFFull, 0, 0, + 0}), + (__m256i)((__v4du){1, 0, 0, 0})), + 0xFFFFFFFFFFFFFull, 0, 0, 0)); + +TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_avx_epu64( + (__m256i)((__v4du){0, 0, 0, 0}), + (__m256i)((__v4du){0x1F000000000000ull, 0, 0, + 0}), + (__m256i)((__v4du){2, 0, 0, 0})), + 0xE000000000000ull, 0, 0, 0)); + 
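+// Reference model assumed by the constexpr checks in this file (a sketch of
+// the VPMADD52 semantics, not the builtin's actual implementation): with
+// M52 = (1ULL << 52) - 1, each 64-bit lane forms the 104-bit product of the
+// low 52 bits of the two multiplicands and accumulates one half of it:
+//   lo: dst = acc + (uint64_t)(((unsigned __int128)(x & M52) * (y & M52)) & M52)
+//   hi: dst = acc + (uint64_t)(((unsigned __int128)(x & M52) * (y & M52)) >> 52)
+// e.g. (2^52 - 1)^2 >> 52 == 2^52 - 2 == 0xFFFFFFFFFFFFEull, as checked above.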
+TEST_CONSTEXPR(match_v2di(_mm_madd52lo_avx_epu64( + (__m128i)((__v2du){5, 10}), + (__m128i)((__v2du){100, 200}), + (__m128i)((__v2du){7, 8})), + 705, 1610)); + +TEST_CONSTEXPR(match_v4di(_mm256_madd52lo_avx_epu64( + (__m256i)((__v4du){1, 2, 3, 4}), + (__m256i)((__v4du){10, 20, 30, 40}), + (__m256i)((__v4du){2, 3, 4, 5})), + 21, 62, 123, 204)); + diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index 2b45b920cf2bc..767425577a130 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -312,36 +312,42 @@ __m64 test_mm_hadd_pi16(__m64 a, __m64 b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128( return _mm_hadd_pi16(a, b); } +TEST_CONSTEXPR(match_v4hi(_mm_hadd_pi16((__m64)(__v4hi){1,2,3,4},(__m64)(__v4hi){5,6,7,8}),3,7,11,15)); __m64 test_mm_hadd_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadd_pi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128( return _mm_hadd_pi32(a, b); } +TEST_CONSTEXPR(match_v2si(_mm_hadd_pi32((__m64)(__v2si){1,2},(__m64)(__v2si){3,4}),3,7)); __m64 test_mm_hadds_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadds_pi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128( return _mm_hadds_pi16(a, b); } +TEST_CONSTEXPR(match_v4hi(_mm_hadds_pi16((__m64)(__v4hi){32767, 32767, 1,3},(__m64)(__v4hi){-1,3, 40, 60}),32767, 4, 2,100)); __m64 test_mm_hsub_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128( return _mm_hsub_pi16(a, b); } +TEST_CONSTEXPR(match_v4hi(_mm_hsub_pi16((__m64)(__v4hi){1,2,4,3},(__m64)(__v4hi){10,5,0,-10}),-1,1,5,10)); __m64 test_mm_hsub_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128( return _mm_hsub_pi32(a, b); } +TEST_CONSTEXPR(match_v2si(_mm_hsub_pi32((__m64)(__v2si){1,2},(__m64)(__v2si){4,3}),-1,1)); __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsubs_pi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128( return _mm_hsubs_pi16(a, b); } +TEST_CONSTEXPR(match_v4hi(_mm_hsubs_pi16((__m64)(__v4hi){32767, 32767, 5, -32767},(__m64)(__v4hi){4,5,10,5}),0,32767,-1,5)); __m64 test_mm_insert_pi16(__m64 a, int d) { // CHECK-LABEL: test_mm_insert_pi16 @@ -403,6 +409,10 @@ int test_mm_movemask_pi8(__m64 a) { // CHECK: call {{.*}}i32 @llvm.x86.sse2.pmovmskb.128( return _mm_movemask_pi8(a); } +TEST_CONSTEXPR(_mm_movemask_pi8((__m64)((__v8qu){0x7F,0x80,0x01,0xFF,0x00,0xAA,0x55,0xC3})) == 0xAA); +TEST_CONSTEXPR(_mm_movemask_pi8((__m64)((__v2si){(int)0x80FF00AA,(int)0x7F0183E1})) == 0x3D); +TEST_CONSTEXPR(_mm_movemask_pi8((__m64)((__v1di){(long long)0xE110837A00924DB0ULL})) == 0xA5); + __m64 test_mm_mul_su32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mul_su32 @@ -432,6 +442,7 @@ __m64 test_mm_mulhrs_pi16(__m64 a, __m64 b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128( return _mm_mulhrs_pi16(a, b); } +TEST_CONSTEXPR(match_v4hi(_mm_mulhrs_pi16((__m64)(__v4hi){+100, +200, -300, -400}, (__m64)(__v4hi){+30000, -20000, +10000, -5000}), +92, -122, -92, +61)); __m64 test_mm_mullo_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mullo_pi16 @@ -583,6 +594,8 @@ __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) { return _mm_shuffle_pi8(a, b); } +TEST_CONSTEXPR(match_v8qi(_mm_shuffle_pi8((__m64)(__v8qi){0,1,2,3,4,5,6,7}, (__m64)(__v8qi){10,20,30,40,50,60,70,80}), 2,4,6,0,2,4,6,0)); + __m64 test_mm_shuffle_pi16(__m64 a) { // CHECK-LABEL: test_mm_shuffle_pi16 // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x 
i32> diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c index 3bad3426b1586..f5c1d00d1b851 100644 --- a/clang/test/CodeGen/X86/sse-builtins.c +++ b/clang/test/CodeGen/X86/sse-builtins.c @@ -561,6 +561,8 @@ int test_mm_movemask_ps(__m128 A) { // CHECK: call {{.*}}i32 @llvm.x86.sse.movmsk.ps(<4 x float> %{{.*}}) return _mm_movemask_ps(A); } +TEST_CONSTEXPR(_mm_movemask_ps((__m128)(__v4sf){-2.0f, 3.0f, -5.5f, -0.0f}) == 0xD); +TEST_CONSTEXPR(_mm_movemask_ps((__m128)(__v4sf){-7.348215e5, 0.00314159, -12.789, 2.7182818}) == 0x5); __m128 test_mm_mul_ps(__m128 A, __m128 B) { // CHECK-LABEL: test_mm_mul_ps diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c index ade7ef39a008a..8e4fb86112c56 100644 --- a/clang/test/CodeGen/X86/sse2-builtins.c +++ b/clang/test/CodeGen/X86/sse2-builtins.c @@ -956,12 +956,17 @@ int test_mm_movemask_epi8(__m128i A) { // CHECK: call {{.*}}i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %{{.*}}) return _mm_movemask_epi8(A); } +TEST_CONSTEXPR(_mm_movemask_epi8((__m128i)(__v16qu){0x7F,0x80,0x01,0xFF,0x00,0xAA,0x55,0xC3,0x12,0x8E,0x00,0xFE,0x7E,0x81,0xFF,0x01}) == 0x6AAA); +TEST_CONSTEXPR(_mm_movemask_epi8((__m128i)(__v4si){(int)0x80FF00AA,(int)0x7F0183E1,(int)0xDEADBEEF,(int)0xC0000001}) == 0x8F3D); +TEST_CONSTEXPR(_mm_movemask_epi8((__m128i)(__v2du){0xFF00000000000080ULL,0x7F010203040506C3ULL}) == 0x181); int test_mm_movemask_pd(__m128d A) { // CHECK-LABEL: test_mm_movemask_pd // CHECK: call {{.*}}i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %{{.*}}) return _mm_movemask_pd(A); } +TEST_CONSTEXPR(_mm_movemask_pd((__m128d)(__v2df){-12345.67890123, 4567.89012345}) == 0x1); +TEST_CONSTEXPR(_mm_movemask_pd((__m128d)(__v2df){0.0000987654321, 09876.5432109876}) == 0x0); __m128i test_mm_mul_epu32(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_mul_epu32 diff --git a/clang/test/CodeGen/X86/sse3-builtins.c b/clang/test/CodeGen/X86/sse3-builtins.c index c53afc56e7246..a82dd4080670b 100644 --- a/clang/test/CodeGen/X86/sse3-builtins.c +++ b/clang/test/CodeGen/X86/sse3-builtins.c @@ -31,24 +31,28 @@ __m128d test_mm_hadd_pd(__m128d A, __m128d B) { // CHECK: call {{.*}}<2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}) return _mm_hadd_pd(A, B); } +TEST_CONSTEXPR(match_m128d(_mm_hadd_pd((__m128d){+1.0, +2.0}, (__m128d){+3.0, +4.0}), +3.0, +7.0)); __m128 test_mm_hadd_ps(__m128 A, __m128 B) { // CHECK-LABEL: test_mm_hadd_ps // CHECK: call {{.*}}<4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) return _mm_hadd_ps(A, B); } +TEST_CONSTEXPR(match_m128(_mm_hadd_ps((__m128){+1.0f, +2.0f, +3.0f, +4.0f}, (__m128){+5.0f,+6.0f,+7.0f,+8.0f}), +3.0f, +7.0f, +11.0f, +15.0f)); __m128d test_mm_hsub_pd(__m128d A, __m128d B) { // CHECK-LABEL: test_mm_hsub_pd // CHECK: call {{.*}}<2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}) return _mm_hsub_pd(A, B); } +TEST_CONSTEXPR(match_m128d(_mm_hsub_pd((__m128d){+1.0, +2.0}, (__m128d){+4.0, +3.0}), -1.0, +1.0)); __m128 test_mm_hsub_ps(__m128 A, __m128 B) { // CHECK-LABEL: test_mm_hsub_ps // CHECK: call {{.*}}<4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}) return _mm_hsub_ps(A, B); } +TEST_CONSTEXPR(match_m128(_mm_hsub_ps((__m128){+1.0f, +2.0f, +4.0f, +3.0f}, (__m128){+5.0f,+7.0f,+10.0f,+8.0f}), -1.0f, +1.0f, -2.0f, +2.0f)); __m128i test_mm_lddqu_si128(__m128i const* P) { // CHECK-LABEL: test_mm_lddqu_si128 diff --git a/clang/test/CodeGen/X86/sse41-builtins.c 
b/clang/test/CodeGen/X86/sse41-builtins.c index 89a7ac29e7db7..62cd392824bb2 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -376,6 +376,16 @@ __m128i test_mm_minpos_epu16(__m128i x) { // CHECK: call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %{{.*}}) return _mm_minpos_epu16(x); } +TEST_CONSTEXPR(match_v8hu(_mm_minpos_epu16((__m128i)(__v8hu){0,0,0,0, 0,0,0,0}), 0,0,0,0, 0,0,0,0)); +TEST_CONSTEXPR(match_v8hu(_mm_minpos_epu16((__m128i)(__v8hu){1,0,0,0, 0,0,0,0}), 0,1,0,0, 0,0,0,0)); +TEST_CONSTEXPR(match_v8hu(_mm_minpos_epu16((__m128i)(__v8hu){65535,65535,65535,65535,65535,65535,65535,65535}), 65535,0,0,0, 0,0,0,0)); +TEST_CONSTEXPR(match_v8hu(_mm_minpos_epu16((__m128i)(__v8hu){9,8,7,6,5,4,3,2}), 2,7,0,0, 0,0,0,0)); +TEST_CONSTEXPR(match_v8hu(_mm_minpos_epu16((__m128i)(__v8hu){5,5,5,5,5,5,5,5}), 5,0,0,0, 0,0,0,0)); +TEST_CONSTEXPR(match_v8hu(_mm_minpos_epu16((__m128i)(__v8hu){5,7,9,4,10,4,11,12}), 4,3,0,0, 0,0,0,0)); +TEST_CONSTEXPR(match_v8hu(_mm_minpos_epu16((__m128i)(__v8hu){6,0,0,0,0,0,0,0}), 0,1,0,0, 0,0,0,0)); +TEST_CONSTEXPR(match_v8hu(_mm_minpos_epu16((__m128i)(__v8hu){1000,2000,3000,4000,5000,6000,7000,1}), 1,7,0,0, 0,0,0,0)); +TEST_CONSTEXPR(match_v8hu(_mm_minpos_epu16((__m128i)(__v8hu){1234,5678,42,9999,65535,0,4242,42}), 0,5,0,0, 0,0,0,0)); +TEST_CONSTEXPR(match_v8hu(_mm_minpos_epu16((__m128i)(__v8hu){400,500,12,600,12,700,800,900}), 12,2,0,0, 0,0,0,0)); __m128i test_mm_mpsadbw_epu8(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_mpsadbw_epu8 diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c index 588576879903b..f70afc01a1963 100644 --- a/clang/test/CodeGen/X86/ssse3-builtins.c +++ b/clang/test/CodeGen/X86/ssse3-builtins.c @@ -60,36 +60,43 @@ __m128i test_mm_hadd_epi16(__m128i a, __m128i b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hadd_epi16(a, b); } +TEST_CONSTEXPR(match_v8hi(_mm_hadd_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8}, (__m128i)(__v8hi){17,18,19,20,21,22,23,24}), 3,7,11,15,35,39,43,47)); __m128i test_mm_hadd_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hadd_epi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_hadd_epi32(a, b); } +TEST_CONSTEXPR(match_v4si(_mm_hadd_epi32((__m128i)(__v4si){1,2,3,4}, (__m128i)(__v4si){5,6,7,8}), 3,7,11,15)); __m128i test_mm_hadds_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hadds_epi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hadds_epi16(a, b); } +TEST_CONSTEXPR(match_v8hi(_mm_hadds_epi16((__m128i)(__v8hi){30000,30000,-1,2,-3,3,1,4}, (__m128i)(__v8hi){2,6,1,9,-4,16,7,8}), 32767, 1,0,5,8,10,12,15)); + __m128i test_mm_hsub_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hsub_epi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hsub_epi16(a, b); } +TEST_CONSTEXPR(match_v8hi(_mm_hsub_epi16((__m128i)(__v8hi){20,15,16,12,9,6,4,2}, (__m128i)(__v8hi){3,2,1,1,4,5,0,2}), 5,4,3,2,1,0,-1,-2)); __m128i test_mm_hsub_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hsub_epi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_hsub_epi32(a, b); } +TEST_CONSTEXPR(match_v4si(_mm_hsub_epi32((__m128i)(__v4si){4,3,1,1}, (__m128i)(__v4si){7,5,10,5}), 1,0,2,5)); __m128i test_mm_hsubs_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_hsubs_epi16 // 
CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_hsubs_epi16(a, b); } +TEST_CONSTEXPR(match_v8hi(_mm_hsubs_epi16((__m128i)(__v8hi){32767, -15,16,12,9,6,4,2},(__m128i)(__v8hi){3,2,1,1,4,5,0,2}), 32767,4,3,2,1,0,-1,-2)); __m128i test_mm_maddubs_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_maddubs_epi16 @@ -103,6 +110,7 @@ __m128i test_mm_mulhrs_epi16(__m128i a, __m128i b) { // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_mulhrs_epi16(a, b); } +TEST_CONSTEXPR(match_v8hi(_mm_mulhrs_epi16((__m128i)(__v8hi){+100, +200, -300, -400, +500, +600, -700, +800}, (__m128i)(__v8hi){+8000, -7000, +6000, -5000, +4000, -3000, +2000, -1000}), +24, -43, -55, +61, +61, -55, -43, -24)); __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_shuffle_epi8 @@ -110,6 +118,8 @@ __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) { return _mm_shuffle_epi8(a, b); } +TEST_CONSTEXPR(match_v16qi(_mm_shuffle_epi8((__m128i)(__v16qs){0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15}, (__m128i)(__v16qs){15,-14,13,-12,11,-10,9,-8,7,-6,5,-4,3,-2,1,0}), -15,0,-13,0,-11,0,-9,0,-7,0,-5,0,-3,0,-1,0)); + __m128i test_mm_sign_epi8(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_sign_epi8 // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c index 8d30e29886046..1e128854d6a75 100644 --- a/clang/test/CodeGen/allow-ubsan-check.c +++ b/clang/test/CodeGen/allow-ubsan-check.c @@ -10,27 +10,27 @@ // CHECK-LABEL: define dso_local noundef i32 @div( // CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = icmp ne i32 [[Y]], 0, !nosanitize [[META2:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X]], -2147483648, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[Y]], -1, !nosanitize [[META2]] -// CHECK-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp ne i32 [[Y]], 0, !nosanitize [[META6:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X]], -2147483648, !nosanitize [[META6]] +// CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[Y]], -1, !nosanitize [[META6]] +// CHECK-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[TMP2]], !nosanitize [[META6]] // // 27 == SO_IntegerDivideByZero -// CHECK-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 27), !nosanitize [[META2]] -// CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP0]], [[TMP4]], !nosanitize [[META2]] +// CHECK-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 27), !nosanitize [[META6]] +// CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true, !nosanitize [[META6]] +// CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP0]], [[TMP4]], !nosanitize [[META6]] // // 41 == SO_SignedIntegerOverflow -// CHECK-NEXT: [[TMP6:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] -// CHECK-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP8:%.*]] = or i1 [[OR]], [[TMP7]], !nosanitize [[META2]] -// CHECK-NEXT: [[TMP9:%.*]] = and i1 [[TMP5]], [[TMP8]], !nosanitize [[META2]] -// CHECK-NEXT: br i1 [[TMP9]], label %[[CONT:.*]], label %[[HANDLER_DIVREM_OVERFLOW:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] +// CHECK-NEXT: [[TMP6:%.*]] = tail call i1 
@llvm.allow.ubsan.check(i8 41), !nosanitize [[META6]] +// CHECK-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META6]] +// CHECK-NEXT: [[TMP8:%.*]] = or i1 [[OR]], [[TMP7]], !nosanitize [[META6]] +// CHECK-NEXT: [[TMP9:%.*]] = and i1 [[TMP5]], [[TMP8]], !nosanitize [[META6]] +// CHECK-NEXT: br i1 [[TMP9]], label %[[CONT:.*]], label %[[HANDLER_DIVREM_OVERFLOW:.*]], !prof [[PROF7:![0-9]+]], !nosanitize [[META6]] // CHECK: [[HANDLER_DIVREM_OVERFLOW]]: -// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: tail call void @__ubsan_handle_divrem_overflow_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP10]], i64 [[TMP11]]) #[[ATTR6:[0-9]+]], !nosanitize [[META2]] -// CHECK-NEXT: unreachable, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META6]] +// CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META6]] +// CHECK-NEXT: tail call void @__ubsan_handle_divrem_overflow_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP10]], i64 [[TMP11]]) #[[ATTR6:[0-9]+]], !nosanitize [[META6]] +// CHECK-NEXT: unreachable, !nosanitize [[META6]] // CHECK: [[CONT]]: // CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[X]], [[Y]] // CHECK-NEXT: ret i32 [[DIV]] @@ -38,21 +38,21 @@ // TR-LABEL: define dso_local noundef i32 @div( // TR-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // TR-NEXT: [[ENTRY:.*:]] -// TR-NEXT: [[TMP0:%.*]] = icmp ne i32 [[Y]], 0, !nosanitize [[META2:![0-9]+]] -// TR-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X]], -2147483648, !nosanitize [[META2]] -// TR-NEXT: [[TMP2:%.*]] = icmp ne i32 [[Y]], -1, !nosanitize [[META2]] -// TR-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] -// TR-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 27), !nosanitize [[META2]] -// TR-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true, !nosanitize [[META2]] -// TR-NEXT: [[TMP5:%.*]] = or i1 [[TMP0]], [[TMP4]], !nosanitize [[META2]] -// TR-NEXT: [[TMP6:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] -// TR-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// TR-NEXT: [[TMP8:%.*]] = or i1 [[OR]], [[TMP7]], !nosanitize [[META2]] -// TR-NEXT: [[TMP9:%.*]] = and i1 [[TMP5]], [[TMP8]], !nosanitize [[META2]] -// TR-NEXT: br i1 [[TMP9]], label %[[CONT:.*]], label %[[TRAP:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] +// TR-NEXT: [[TMP0:%.*]] = icmp ne i32 [[Y]], 0, !nosanitize [[META6:![0-9]+]] +// TR-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X]], -2147483648, !nosanitize [[META6]] +// TR-NEXT: [[TMP2:%.*]] = icmp ne i32 [[Y]], -1, !nosanitize [[META6]] +// TR-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[TMP2]], !nosanitize [[META6]] +// TR-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 27), !nosanitize [[META6]] +// TR-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true, !nosanitize [[META6]] +// TR-NEXT: [[TMP5:%.*]] = or i1 [[TMP0]], [[TMP4]], !nosanitize [[META6]] +// TR-NEXT: [[TMP6:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META6]] +// TR-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META6]] +// TR-NEXT: [[TMP8:%.*]] = or i1 [[OR]], [[TMP7]], !nosanitize [[META6]] +// TR-NEXT: [[TMP9:%.*]] = and i1 [[TMP5]], [[TMP8]], !nosanitize [[META6]] +// TR-NEXT: br i1 [[TMP9]], label %[[CONT:.*]], label %[[TRAP:.*]], !prof [[PROF7:![0-9]+]], !nosanitize [[META6]] // TR: [[TRAP]]: -// TR-NEXT: tail call void @llvm.ubsantrap(i8 3) 
#[[ATTR7:[0-9]+]], !nosanitize [[META2]] -// TR-NEXT: unreachable, !nosanitize [[META2]] +// TR-NEXT: tail call void @llvm.ubsantrap(i8 3) #[[ATTR7:[0-9]+]], !nosanitize [[META6]] +// TR-NEXT: unreachable, !nosanitize [[META6]] // TR: [[CONT]]: // TR-NEXT: [[DIV:%.*]] = sdiv i32 [[X]], [[Y]] // TR-NEXT: ret i32 [[DIV]] @@ -60,23 +60,23 @@ // REC-LABEL: define dso_local noundef i32 @div( // REC-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // REC-NEXT: [[ENTRY:.*:]] -// REC-NEXT: [[TMP0:%.*]] = icmp ne i32 [[Y]], 0, !nosanitize [[META2:![0-9]+]] -// REC-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X]], -2147483648, !nosanitize [[META2]] -// REC-NEXT: [[TMP2:%.*]] = icmp ne i32 [[Y]], -1, !nosanitize [[META2]] -// REC-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] -// REC-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 27), !nosanitize [[META2]] -// REC-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true, !nosanitize [[META2]] -// REC-NEXT: [[TMP5:%.*]] = or i1 [[TMP0]], [[TMP4]], !nosanitize [[META2]] -// REC-NEXT: [[TMP6:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] -// REC-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] -// REC-NEXT: [[TMP8:%.*]] = or i1 [[OR]], [[TMP7]], !nosanitize [[META2]] -// REC-NEXT: [[TMP9:%.*]] = and i1 [[TMP5]], [[TMP8]], !nosanitize [[META2]] -// REC-NEXT: br i1 [[TMP9]], label %[[CONT:.*]], label %[[HANDLER_DIVREM_OVERFLOW:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] +// REC-NEXT: [[TMP0:%.*]] = icmp ne i32 [[Y]], 0, !nosanitize [[META6:![0-9]+]] +// REC-NEXT: [[TMP1:%.*]] = icmp ne i32 [[X]], -2147483648, !nosanitize [[META6]] +// REC-NEXT: [[TMP2:%.*]] = icmp ne i32 [[Y]], -1, !nosanitize [[META6]] +// REC-NEXT: [[OR:%.*]] = or i1 [[TMP1]], [[TMP2]], !nosanitize [[META6]] +// REC-NEXT: [[TMP3:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 27), !nosanitize [[META6]] +// REC-NEXT: [[TMP4:%.*]] = xor i1 [[TMP3]], true, !nosanitize [[META6]] +// REC-NEXT: [[TMP5:%.*]] = or i1 [[TMP0]], [[TMP4]], !nosanitize [[META6]] +// REC-NEXT: [[TMP6:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META6]] +// REC-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META6]] +// REC-NEXT: [[TMP8:%.*]] = or i1 [[OR]], [[TMP7]], !nosanitize [[META6]] +// REC-NEXT: [[TMP9:%.*]] = and i1 [[TMP5]], [[TMP8]], !nosanitize [[META6]] +// REC-NEXT: br i1 [[TMP9]], label %[[CONT:.*]], label %[[HANDLER_DIVREM_OVERFLOW:.*]], !prof [[PROF7:![0-9]+]], !nosanitize [[META6]] // REC: [[HANDLER_DIVREM_OVERFLOW]]: -// REC-NEXT: [[TMP10:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] -// REC-NEXT: [[TMP11:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] -// REC-NEXT: tail call void @__ubsan_handle_divrem_overflow(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP10]], i64 [[TMP11]]) #[[ATTR6:[0-9]+]], !nosanitize [[META2]] -// REC-NEXT: br label %[[CONT]], !nosanitize [[META2]] +// REC-NEXT: [[TMP10:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META6]] +// REC-NEXT: [[TMP11:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META6]] +// REC-NEXT: tail call void @__ubsan_handle_divrem_overflow(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP10]], i64 [[TMP11]]) #[[ATTR6:[0-9]+]], !nosanitize [[META6]] +// REC-NEXT: br label %[[CONT]], !nosanitize [[META6]] // REC: [[CONT]]: // REC-NEXT: [[DIV:%.*]] = sdiv i32 [[X]], [[Y]] // REC-NEXT: ret i32 [[DIV]] @@ -88,45 +88,45 @@ int div(int x, int y) { // CHECK-LABEL: define dso_local i32 @null( // CHECK-SAME: ptr noundef readonly 
captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META6]] // // 29 == SO_Null -// CHECK-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META6]] // CHECK-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] -// CHECK-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_TYPE_MISMATCH:.*]], label %[[CONT:.*]], !prof [[PROF4:![0-9]+]], !nosanitize [[META2]] +// CHECK-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_TYPE_MISMATCH:.*]], label %[[CONT:.*]], !prof [[PROF8:![0-9]+]], !nosanitize [[META6]] // CHECK: [[HANDLER_TYPE_MISMATCH]]: -// CHECK-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR6]], !nosanitize [[META2]] -// CHECK-NEXT: unreachable, !nosanitize [[META2]] +// CHECK-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR6]], !nosanitize [[META6]] +// CHECK-NEXT: unreachable, !nosanitize [[META6]] // CHECK: [[CONT]]: -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA5:![0-9]+]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA2:![0-9]+]] // CHECK-NEXT: ret i32 [[TMP2]] // // TR-LABEL: define dso_local i32 @null( // TR-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { // TR-NEXT: [[ENTRY:.*:]] -// TR-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] -// TR-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] +// TR-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META6]] +// TR-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META6]] // TR-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] -// TR-NEXT: br i1 [[DOTNOT1]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF4:![0-9]+]], !nosanitize [[META2]] +// TR-NEXT: br i1 [[DOTNOT1]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF8:![0-9]+]], !nosanitize [[META6]] // TR: [[TRAP]]: -// TR-NEXT: tail call void @llvm.ubsantrap(i8 22) #[[ATTR7]], !nosanitize [[META2]] -// TR-NEXT: unreachable, !nosanitize [[META2]] +// TR-NEXT: tail call void @llvm.ubsantrap(i8 22) #[[ATTR7]], !nosanitize [[META6]] +// TR-NEXT: unreachable, !nosanitize [[META6]] // TR: [[CONT]]: -// TR-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA5:![0-9]+]] +// TR-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA2:![0-9]+]] // TR-NEXT: ret i32 [[TMP2]] // // REC-LABEL: define dso_local i32 @null( // REC-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // REC-NEXT: [[ENTRY:.*:]] -// REC-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] -// REC-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] +// REC-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META6]] +// REC-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META6]] // REC-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] -// REC-NEXT: br i1 [[DOTNOT1]], label %[[HANDLER_TYPE_MISMATCH:.*]], label %[[CONT:.*]], !prof [[PROF4:![0-9]+]], !nosanitize [[META2]] +// REC-NEXT: br i1 [[DOTNOT1]], label 
%[[HANDLER_TYPE_MISMATCH:.*]], label %[[CONT:.*]], !prof [[PROF8:![0-9]+]], !nosanitize [[META6]] // REC: [[HANDLER_TYPE_MISMATCH]]: -// REC-NEXT: tail call void @__ubsan_handle_type_mismatch_v1(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR6]], !nosanitize [[META2]] -// REC-NEXT: br label %[[CONT]], !nosanitize [[META2]] +// REC-NEXT: tail call void @__ubsan_handle_type_mismatch_v1(ptr nonnull @[[GLOB2:[0-9]+]], i64 0) #[[ATTR6]], !nosanitize [[META6]] +// REC-NEXT: br label %[[CONT]], !nosanitize [[META6]] // REC: [[CONT]]: -// REC-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA5:![0-9]+]] +// REC-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[INT_TBAA2:![0-9]+]] // REC-NEXT: ret i32 [[TMP2]] // int null(int* x) { @@ -136,52 +136,52 @@ int null(int* x) { // CHECK-LABEL: define dso_local noundef i32 @overflow( // CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 [[Y]]), !nosanitize [[META2]] -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 [[Y]]), !nosanitize [[META6]] +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META6]] // // 41 == SO_SignedIntegerOverflow -// CHECK-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META6]] // CHECK-NEXT: [[DOTDEMORGAN:%.*]] = and i1 [[TMP1]], [[TMP2]] -// CHECK-NEXT: br i1 [[DOTDEMORGAN]], label %[[HANDLER_ADD_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF4]], !nosanitize [[META2]] +// CHECK-NEXT: br i1 [[DOTDEMORGAN]], label %[[HANDLER_ADD_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF8]], !nosanitize [[META6]] // CHECK: [[HANDLER_ADD_OVERFLOW]]: -// CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] -// CHECK-NEXT: tail call void @__ubsan_handle_add_overflow_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[TMP3]], i64 [[TMP4]]) #[[ATTR6]], !nosanitize [[META2]] -// CHECK-NEXT: unreachable, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META6]] +// CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META6]] +// CHECK-NEXT: tail call void @__ubsan_handle_add_overflow_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[TMP3]], i64 [[TMP4]]) #[[ATTR6]], !nosanitize [[META6]] +// CHECK-NEXT: unreachable, !nosanitize [[META6]] // CHECK: [[CONT]]: -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META2]] +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META6]] // CHECK-NEXT: ret i32 [[TMP5]] // // TR-LABEL: define dso_local noundef i32 @overflow( // TR-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // TR-NEXT: [[ENTRY:.*:]] -// TR-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 [[Y]]), !nosanitize [[META2]] -// TR-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] -// TR-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] +// TR-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 [[Y]]), 
!nosanitize [[META6]] +// TR-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META6]] +// TR-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META6]] // TR-NEXT: [[DOTDEMORGAN:%.*]] = and i1 [[TMP1]], [[TMP2]] -// TR-NEXT: br i1 [[DOTDEMORGAN]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF4]], !nosanitize [[META2]] +// TR-NEXT: br i1 [[DOTDEMORGAN]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF8]], !nosanitize [[META6]] // TR: [[TRAP]]: -// TR-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR7]], !nosanitize [[META2]] -// TR-NEXT: unreachable, !nosanitize [[META2]] +// TR-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR7]], !nosanitize [[META6]] +// TR-NEXT: unreachable, !nosanitize [[META6]] // TR: [[CONT]]: -// TR-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META2]] +// TR-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META6]] // TR-NEXT: ret i32 [[TMP3]] // // REC-LABEL: define dso_local noundef i32 @overflow( // REC-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // REC-NEXT: [[ENTRY:.*:]] -// REC-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 [[Y]]), !nosanitize [[META2]] -// REC-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] -// REC-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] +// REC-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 [[Y]]), !nosanitize [[META6]] +// REC-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META6]] +// REC-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META6]] // REC-NEXT: [[DOTDEMORGAN:%.*]] = and i1 [[TMP1]], [[TMP2]] -// REC-NEXT: br i1 [[DOTDEMORGAN]], label %[[HANDLER_ADD_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF4]], !nosanitize [[META2]] +// REC-NEXT: br i1 [[DOTDEMORGAN]], label %[[HANDLER_ADD_OVERFLOW:.*]], label %[[CONT:.*]], !prof [[PROF8]], !nosanitize [[META6]] // REC: [[HANDLER_ADD_OVERFLOW]]: -// REC-NEXT: [[TMP3:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META2]] -// REC-NEXT: [[TMP4:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META2]] -// REC-NEXT: tail call void @__ubsan_handle_add_overflow(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[TMP3]], i64 [[TMP4]]) #[[ATTR6]], !nosanitize [[META2]] -// REC-NEXT: br label %[[CONT]], !nosanitize [[META2]] +// REC-NEXT: [[TMP3:%.*]] = zext i32 [[X]] to i64, !nosanitize [[META6]] +// REC-NEXT: [[TMP4:%.*]] = zext i32 [[Y]] to i64, !nosanitize [[META6]] +// REC-NEXT: tail call void @__ubsan_handle_add_overflow(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[TMP3]], i64 [[TMP4]]) #[[ATTR6]], !nosanitize [[META6]] +// REC-NEXT: br label %[[CONT]], !nosanitize [[META6]] // REC: [[CONT]]: -// REC-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META2]] +// REC-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META6]] // REC-NEXT: ret i32 [[TMP5]] // int overflow(int x, int y) { @@ -200,16 +200,16 @@ void use(double*); // CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]] // // 71 == SO_LocalBounds -// CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]] -// CHECK-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META6]] +// CHECK-NEXT: [[TMP3:%.*]] = and 
i1 [[TMP1]], [[TMP2]], !nosanitize [[META6]] // CHECK-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] // CHECK: [[BB4]]: // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] // CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA9:![0-9]+]] // CHECK-NEXT: ret double [[TMP5]] // CHECK: [[TRAP]]: -// CHECK-NEXT: call void @__ubsan_handle_local_out_of_bounds_abort() #[[ATTR6]], !nosanitize [[META2]] -// CHECK-NEXT: unreachable, !nosanitize [[META2]] +// CHECK-NEXT: call void @__ubsan_handle_local_out_of_bounds_abort() #[[ATTR6]], !nosanitize [[META6]] +// CHECK-NEXT: unreachable, !nosanitize [[META6]] // // TR-LABEL: define dso_local double @lbounds( // TR-SAME: i32 noundef [[B:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { @@ -219,16 +219,16 @@ void use(double*); // TR-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR8:[0-9]+]] // TR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 // TR-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]] -// TR-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]] -// TR-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// TR-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META6]] +// TR-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META6]] // TR-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] // TR: [[BB4]]: // TR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] // TR-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA9:![0-9]+]] // TR-NEXT: ret double [[TMP5]] // TR: [[TRAP]]: -// TR-NEXT: call void @llvm.ubsantrap(i8 71) #[[ATTR7]], !nosanitize [[META2]] -// TR-NEXT: unreachable, !nosanitize [[META2]] +// TR-NEXT: call void @llvm.ubsantrap(i8 71) #[[ATTR7]], !nosanitize [[META6]] +// TR-NEXT: unreachable, !nosanitize [[META6]] // // REC-LABEL: define dso_local double @lbounds( // REC-SAME: i32 noundef [[B:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -238,16 +238,16 @@ void use(double*); // REC-NEXT: call void @use(ptr noundef nonnull [[VLA]]) #[[ATTR5:[0-9]+]] // REC-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 // REC-NEXT: [[TMP1:%.*]] = icmp ule i64 [[TMP0]], [[IDXPROM]] -// REC-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META2]] -// REC-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META2]] +// REC-NEXT: [[TMP2:%.*]] = call i1 @llvm.allow.ubsan.check(i8 71), !nosanitize [[META6]] +// REC-NEXT: [[TMP3:%.*]] = and i1 [[TMP1]], [[TMP2]], !nosanitize [[META6]] // REC-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] // REC: [[BB4]]: // REC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] // REC-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA9:![0-9]+]] // REC-NEXT: ret double [[TMP5]] // REC: [[TRAP]]: -// REC-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR6]], !nosanitize [[META2]] -// REC-NEXT: br label %[[BB4]], !nosanitize [[META2]] +// REC-NEXT: call void @__ubsan_handle_local_out_of_bounds() #[[ATTR6]], !nosanitize [[META6]] +// REC-NEXT: br label %[[BB4]], !nosanitize [[META6]] // double lbounds(int b, int i) { double a[b]; @@ -256,33 +256,33 @@ double lbounds(int b, int i) { } //. 
-// CHECK: [[META2]] = !{} -// CHECK: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} -// CHECK: [[PROF4]] = !{!"branch_weights", i32 1, i32 1048575} -// CHECK: [[INT_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} -// CHECK: [[META6]] = !{!"int", [[META7:![0-9]+]], i64 0} -// CHECK: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} -// CHECK: [[META8]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META6]] = !{} +// CHECK: [[PROF7]] = !{!"branch_weights", i32 1048575, i32 1} +// CHECK: [[PROF8]] = !{!"branch_weights", i32 1, i32 1048575} // CHECK: [[DOUBLE_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} -// CHECK: [[META10]] = !{!"double", [[META7]], i64 0} +// CHECK: [[META10]] = !{!"double", [[META4]], i64 0} //. -// TR: [[META2]] = !{} -// TR: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} -// TR: [[PROF4]] = !{!"branch_weights", i32 1, i32 1048575} -// TR: [[INT_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} -// TR: [[META6]] = !{!"int", [[META7:![0-9]+]], i64 0} -// TR: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} -// TR: [[META8]] = !{!"Simple C/C++ TBAA"} +// TR: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// TR: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// TR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// TR: [[META5]] = !{!"Simple C/C++ TBAA"} +// TR: [[META6]] = !{} +// TR: [[PROF7]] = !{!"branch_weights", i32 1048575, i32 1} +// TR: [[PROF8]] = !{!"branch_weights", i32 1, i32 1048575} // TR: [[DOUBLE_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} -// TR: [[META10]] = !{!"double", [[META7]], i64 0} +// TR: [[META10]] = !{!"double", [[META4]], i64 0} //. -// REC: [[META2]] = !{} -// REC: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} -// REC: [[PROF4]] = !{!"branch_weights", i32 1, i32 1048575} -// REC: [[INT_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} -// REC: [[META6]] = !{!"int", [[META7:![0-9]+]], i64 0} -// REC: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} -// REC: [[META8]] = !{!"Simple C/C++ TBAA"} +// REC: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// REC: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// REC: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// REC: [[META5]] = !{!"Simple C/C++ TBAA"} +// REC: [[META6]] = !{} +// REC: [[PROF7]] = !{!"branch_weights", i32 1048575, i32 1} +// REC: [[PROF8]] = !{!"branch_weights", i32 1, i32 1048575} // REC: [[DOUBLE_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} -// REC: [[META10]] = !{!"double", [[META7]], i64 0} +// REC: [[META10]] = !{!"double", [[META4]], i64 0} //. 
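
The ARM MVE test updates that follow track an IR-level change to the masked
load/store intrinsics: the explicit i32 alignment operand is dropped, and the
alignment is instead carried as a parameter attribute on the pointer. A
before/after sketch of the shape change, taken from the updated v4i32 CHECK
lines (the %v/%p/%m value names are illustrative only):

  ; before: alignment passed as a separate i32 operand
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %v, ptr %p, i32 4, <4 x i1> %m)
  ; after: alignment encoded as a `ptr align 4` attribute
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %v, ptr align 4 %p, <4 x i1> %m)

The dup.c hunks additionally drop a now-redundant select of the splat against
undef (select %mask, %splat, undef folds to %splat), returning the splat
directly.
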
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/cplusplus.cpp b/clang/test/CodeGen/arm-mve-intrinsics/cplusplus.cpp index 29719614d04fb..4e3d0cec202b9 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/cplusplus.cpp +++ b/clang/test/CodeGen/arm-mve-intrinsics/cplusplus.cpp @@ -114,7 +114,7 @@ uint16x8_t test_vld1q_u16(const uint16_t *base) // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) -// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vst1q_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/dup.c b/clang/test/CodeGen/arm-mve-intrinsics/dup.c index c2c7a9c278f67..e7113fdcba90a 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/dup.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/dup.c @@ -244,8 +244,7 @@ uint32x4_t test_vdupq_m_n_u32(uint32x4_t inactive, uint32_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[A:%.*]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x half> [[DOTSPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x half> [[DOTSPLAT]], <8 x half> undef -// CHECK-NEXT: ret <8 x half> [[TMP2]] +// CHECK-NEXT: ret <8 x half> [[DOTSPLAT]] // float16x8_t test_vdupq_x_n_f16(float16_t a, mve_pred16_t p) { @@ -258,8 +257,7 @@ float16x8_t test_vdupq_x_n_f16(float16_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[DOTSPLAT]], <4 x float> undef -// CHECK-NEXT: ret <4 x float> [[TMP2]] +// CHECK-NEXT: ret <4 x float> [[DOTSPLAT]] // float32x4_t test_vdupq_x_n_f32(float32_t a, mve_pred16_t p) { @@ -272,8 +270,7 @@ float32x4_t test_vdupq_x_n_f32(float32_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[DOTSPLAT]], <16 x i8> undef -// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// CHECK-NEXT: ret <16 x i8> [[DOTSPLAT]] // int8x16_t test_vdupq_x_n_s8(int8_t a, mve_pred16_t p) { @@ -286,8 +283,7 @@ int8x16_t test_vdupq_x_n_s8(int8_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[A:%.*]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[DOTSPLAT]], <8 x i16> undef -// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// CHECK-NEXT: ret <8 x i16> 
[[DOTSPLAT]] // int16x8_t test_vdupq_x_n_s16(int16_t a, mve_pred16_t p) { @@ -300,8 +296,7 @@ int16x8_t test_vdupq_x_n_s16(int16_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[DOTSPLAT]], <4 x i32> undef -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-NEXT: ret <4 x i32> [[DOTSPLAT]] // int32x4_t test_vdupq_x_n_s32(int32_t a, mve_pred16_t p) { @@ -314,8 +309,7 @@ int32x4_t test_vdupq_x_n_s32(int32_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i8> [[DOTSPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[DOTSPLAT]], <16 x i8> undef -// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// CHECK-NEXT: ret <16 x i8> [[DOTSPLAT]] // uint8x16_t test_vdupq_x_n_u8(uint8_t a, mve_pred16_t p) { @@ -328,8 +322,7 @@ uint8x16_t test_vdupq_x_n_u8(uint8_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[A:%.*]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[DOTSPLAT]], <8 x i16> undef -// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// CHECK-NEXT: ret <8 x i16> [[DOTSPLAT]] // uint16x8_t test_vdupq_x_n_u16(uint16_t a, mve_pred16_t p) { @@ -342,8 +335,7 @@ uint16x8_t test_vdupq_x_n_u16(uint16_t a, mve_pred16_t p) // CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[DOTSPLAT]], <4 x i32> undef -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-NEXT: ret <4 x i32> [[DOTSPLAT]] // uint32x4_t test_vdupq_x_n_u32(uint32_t a, mve_pred16_t p) { diff --git a/clang/test/CodeGen/arm-mve-intrinsics/load-store.c b/clang/test/CodeGen/arm-mve-intrinsics/load-store.c index 2dde75fa5586b..ede2e956f4d92 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/load-store.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/load-store.c @@ -8,8 +8,8 @@ // CHECK-LABEL: @test_vld1q_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[BASE:%.*]], align 2 -// CHECK-NEXT: ret <8 x half> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[BASE:%.*]], align 2 +// CHECK-NEXT: ret <8 x half> [[TMP0]] // float16x8_t test_vld1q_f16(const float16_t *base) { @@ -22,8 +22,8 @@ float16x8_t test_vld1q_f16(const float16_t *base) // CHECK-LABEL: @test_vld1q_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[BASE:%.*]], align 4 -// CHECK-NEXT: ret <4 x float> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[BASE:%.*]], align 4 +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float32x4_t 
test_vld1q_f32(const float32_t *base) { @@ -36,8 +36,8 @@ float32x4_t test_vld1q_f32(const float32_t *base) // CHECK-LABEL: @test_vld1q_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1 -// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1 +// CHECK-NEXT: ret <16 x i8> [[TMP0]] // int8x16_t test_vld1q_s8(const int8_t *base) { @@ -50,8 +50,8 @@ int8x16_t test_vld1q_s8(const int8_t *base) // CHECK-LABEL: @test_vld1q_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2 -// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2 +// CHECK-NEXT: ret <8 x i16> [[TMP0]] // int16x8_t test_vld1q_s16(const int16_t *base) { @@ -64,8 +64,8 @@ int16x8_t test_vld1q_s16(const int16_t *base) // CHECK-LABEL: @test_vld1q_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4 -// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4 +// CHECK-NEXT: ret <4 x i32> [[TMP0]] // int32x4_t test_vld1q_s32(const int32_t *base) { @@ -78,8 +78,8 @@ int32x4_t test_vld1q_s32(const int32_t *base) // CHECK-LABEL: @test_vld1q_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1 -// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1 +// CHECK-NEXT: ret <16 x i8> [[TMP0]] // uint8x16_t test_vld1q_u8(const uint8_t *base) { @@ -92,8 +92,8 @@ uint8x16_t test_vld1q_u8(const uint8_t *base) // CHECK-LABEL: @test_vld1q_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2 -// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2 +// CHECK-NEXT: ret <8 x i16> [[TMP0]] // uint16x8_t test_vld1q_u16(const uint16_t *base) { @@ -106,8 +106,8 @@ uint16x8_t test_vld1q_u16(const uint16_t *base) // CHECK-LABEL: @test_vld1q_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4 -// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4 +// CHECK-NEXT: ret <4 x i32> [[TMP0]] // uint32x4_t test_vld1q_u32(const uint32_t *base) { @@ -120,10 +120,10 @@ uint32x4_t test_vld1q_u32(const uint32_t *base) // CHECK-LABEL: @test_vld1q_z_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]], <8 x half> zeroinitializer) -// CHECK-NEXT: ret <8 x half> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 2 [[BASE:%.*]], <8 x i1> [[TMP1]], <8 x half> zeroinitializer) +// CHECK-NEXT: ret <8 x half> [[TMP2]] // float16x8_t test_vld1q_z_f16(const float16_t *base, mve_pred16_t p) { @@ -136,10 +136,10 @@ float16x8_t test_vld1q_z_f16(const float16_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vld1q_z_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 
[[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x float> zeroinitializer) -// CHECK-NEXT: ret <4 x float> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]], <4 x float> zeroinitializer) +// CHECK-NEXT: ret <4 x float> [[TMP2]] // float32x4_t test_vld1q_z_f32(const float32_t *base, mve_pred16_t p) { @@ -152,10 +152,10 @@ float32x4_t test_vld1q_z_f32(const float32_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vld1q_z_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer) -// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[BASE:%.*]], <16 x i1> [[TMP1]], <16 x i8> zeroinitializer) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] // int8x16_t test_vld1q_z_s8(const int8_t *base, mve_pred16_t p) { @@ -168,10 +168,10 @@ int8x16_t test_vld1q_z_s8(const int8_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vld1q_z_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer) -// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[BASE:%.*]], <8 x i1> [[TMP1]], <8 x i16> zeroinitializer) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] // int16x8_t test_vld1q_z_s16(const int16_t *base, mve_pred16_t p) { @@ -184,10 +184,10 @@ int16x8_t test_vld1q_z_s16(const int16_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vld1q_z_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer) -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]], <4 x i32> zeroinitializer) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vld1q_z_s32(const int32_t *base, mve_pred16_t p) { @@ -200,10 +200,10 @@ int32x4_t test_vld1q_z_s32(const int32_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vld1q_z_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> 
@llvm.masked.load.v16i8.p0(ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer) -// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[BASE:%.*]], <16 x i1> [[TMP1]], <16 x i8> zeroinitializer) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] // uint8x16_t test_vld1q_z_u8(const uint8_t *base, mve_pred16_t p) { @@ -216,10 +216,10 @@ uint8x16_t test_vld1q_z_u8(const uint8_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vld1q_z_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer) -// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[BASE:%.*]], <8 x i1> [[TMP1]], <8 x i16> zeroinitializer) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vld1q_z_u16(const uint16_t *base, mve_pred16_t p) { @@ -232,10 +232,10 @@ uint16x8_t test_vld1q_z_u16(const uint16_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vld1q_z_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer) -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]], <4 x i32> zeroinitializer) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // uint32x4_t test_vld1q_z_u32(const uint32_t *base, mve_pred16_t p) { @@ -248,8 +248,8 @@ uint32x4_t test_vld1q_z_u32(const uint32_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrbq_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1 -// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1 +// CHECK-NEXT: ret <16 x i8> [[TMP0]] // int8x16_t test_vldrbq_s8(const int8_t *base) { @@ -258,9 +258,9 @@ int8x16_t test_vldrbq_s8(const int8_t *base) // CHECK-LABEL: @test_vldrbq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[BASE:%.*]], align 1 -// CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16> -// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[BASE:%.*]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP1]] // int16x8_t test_vldrbq_s16(const int8_t *base) { @@ -269,9 +269,9 @@ int16x8_t test_vldrbq_s16(const int8_t *base) // CHECK-LABEL: @test_vldrbq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[BASE:%.*]], align 1 -// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-NEXT: [[TMP0:%.*]] = 
load <4 x i8>, ptr [[BASE:%.*]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP1]] // int32x4_t test_vldrbq_s32(const int8_t *base) { @@ -280,8 +280,8 @@ int32x4_t test_vldrbq_s32(const int8_t *base) // CHECK-LABEL: @test_vldrbq_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1 -// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[BASE:%.*]], align 1 +// CHECK-NEXT: ret <16 x i8> [[TMP0]] // uint8x16_t test_vldrbq_u8(const uint8_t *base) { @@ -290,9 +290,9 @@ uint8x16_t test_vldrbq_u8(const uint8_t *base) // CHECK-LABEL: @test_vldrbq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[BASE:%.*]], align 1 -// CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i16> -// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[BASE:%.*]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP1]] // uint16x8_t test_vldrbq_u16(const uint8_t *base) { @@ -301,9 +301,9 @@ uint16x8_t test_vldrbq_u16(const uint8_t *base) // CHECK-LABEL: @test_vldrbq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[BASE:%.*]], align 1 -// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[BASE:%.*]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP1]] // uint32x4_t test_vldrbq_u32(const uint8_t *base) { @@ -312,10 +312,10 @@ uint32x4_t test_vldrbq_u32(const uint8_t *base) // CHECK-LABEL: @test_vldrbq_z_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer) -// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[BASE:%.*]], <16 x i1> [[TMP1]], <16 x i8> zeroinitializer) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] // int8x16_t test_vldrbq_z_s8(const int8_t *base, mve_pred16_t p) { @@ -324,11 +324,11 @@ int8x16_t test_vldrbq_z_s8(const int8_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrbq_z_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[BASE:%.*]], i32 1, <8 x i1> [[TMP2]], <8 x i8> zeroinitializer) -// CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[TMP3]] to <8 x i16> -// CHECK-NEXT: ret <8 x i16> [[TMP4]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 [[BASE:%.*]], <8 x i1> [[TMP1]], <8 x i8> zeroinitializer) +// CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP3]] // int16x8_t test_vldrbq_z_s16(const int8_t *base, mve_pred16_t p) { @@ -337,11 +337,11 @@ int16x8_t 
test_vldrbq_z_s16(const int8_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrbq_z_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[BASE:%.*]], i32 1, <4 x i1> [[TMP2]], <4 x i8> zeroinitializer) -// CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i8> [[TMP3]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP4]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[BASE:%.*]], <4 x i1> [[TMP1]], <4 x i8> zeroinitializer) +// CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP3]] // int32x4_t test_vldrbq_z_s32(const int8_t *base, mve_pred16_t p) { @@ -350,10 +350,10 @@ int32x4_t test_vldrbq_z_s32(const int8_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrbq_z_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer) -// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[BASE:%.*]], <16 x i1> [[TMP1]], <16 x i8> zeroinitializer) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] // uint8x16_t test_vldrbq_z_u8(const uint8_t *base, mve_pred16_t p) { @@ -362,11 +362,11 @@ uint8x16_t test_vldrbq_z_u8(const uint8_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrbq_z_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[BASE:%.*]], i32 1, <8 x i1> [[TMP2]], <8 x i8> zeroinitializer) -// CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i16> -// CHECK-NEXT: ret <8 x i16> [[TMP4]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 [[BASE:%.*]], <8 x i1> [[TMP1]], <8 x i8> zeroinitializer) +// CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP3]] // uint16x8_t test_vldrbq_z_u16(const uint8_t *base, mve_pred16_t p) { @@ -375,11 +375,11 @@ uint16x8_t test_vldrbq_z_u16(const uint8_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrbq_z_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[BASE:%.*]], i32 1, <4 x i1> [[TMP2]], <4 x i8> zeroinitializer) -// CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP4]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call 
<4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[BASE:%.*]], <4 x i1> [[TMP1]], <4 x i8> zeroinitializer) +// CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP3]] // uint32x4_t test_vldrbq_z_u32(const uint8_t *base, mve_pred16_t p) { @@ -388,8 +388,8 @@ uint32x4_t test_vldrbq_z_u32(const uint8_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrhq_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[BASE:%.*]], align 2 -// CHECK-NEXT: ret <8 x half> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[BASE:%.*]], align 2 +// CHECK-NEXT: ret <8 x half> [[TMP0]] // float16x8_t test_vldrhq_f16(const float16_t *base) { @@ -398,8 +398,8 @@ float16x8_t test_vldrhq_f16(const float16_t *base) // CHECK-LABEL: @test_vldrhq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2 -// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2 +// CHECK-NEXT: ret <8 x i16> [[TMP0]] // int16x8_t test_vldrhq_s16(const int16_t *base) { @@ -408,9 +408,9 @@ int16x8_t test_vldrhq_s16(const int16_t *base) // CHECK-LABEL: @test_vldrhq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[BASE:%.*]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[BASE:%.*]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP1]] // int32x4_t test_vldrhq_s32(const int16_t *base) { @@ -419,8 +419,8 @@ int32x4_t test_vldrhq_s32(const int16_t *base) // CHECK-LABEL: @test_vldrhq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2 -// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[BASE:%.*]], align 2 +// CHECK-NEXT: ret <8 x i16> [[TMP0]] // uint16x8_t test_vldrhq_u16(const uint16_t *base) { @@ -429,9 +429,9 @@ uint16x8_t test_vldrhq_u16(const uint16_t *base) // CHECK-LABEL: @test_vldrhq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[BASE:%.*]], align 2 -// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[BASE:%.*]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP1]] // uint32x4_t test_vldrhq_u32(const uint16_t *base) { @@ -440,10 +440,10 @@ uint32x4_t test_vldrhq_u32(const uint16_t *base) // CHECK-LABEL: @test_vldrhq_z_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]], <8 x half> zeroinitializer) -// CHECK-NEXT: ret <8 x half> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 2 [[BASE:%.*]], <8 x i1> [[TMP1]], <8 x half> zeroinitializer) +// CHECK-NEXT: ret <8 x half> [[TMP2]] // float16x8_t test_vldrhq_z_f16(const float16_t *base, mve_pred16_t p) { @@ -452,10 +452,10 @@ float16x8_t test_vldrhq_z_f16(const float16_t *base, mve_pred16_t p) // 
CHECK-LABEL: @test_vldrhq_z_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer) -// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[BASE:%.*]], <8 x i1> [[TMP1]], <8 x i16> zeroinitializer) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] // int16x8_t test_vldrhq_z_s16(const int16_t *base, mve_pred16_t p) { @@ -464,11 +464,11 @@ int16x8_t test_vldrhq_z_s16(const int16_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrhq_z_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[BASE:%.*]], i32 2, <4 x i1> [[TMP2]], <4 x i16> zeroinitializer) -// CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP4]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 [[BASE:%.*]], <4 x i1> [[TMP1]], <4 x i16> zeroinitializer) +// CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP3]] // int32x4_t test_vldrhq_z_s32(const int16_t *base, mve_pred16_t p) { @@ -477,10 +477,10 @@ int32x4_t test_vldrhq_z_s32(const int16_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrhq_z_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer) -// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[BASE:%.*]], <8 x i1> [[TMP1]], <8 x i16> zeroinitializer) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] // uint16x8_t test_vldrhq_z_u16(const uint16_t *base, mve_pred16_t p) { @@ -489,11 +489,11 @@ uint16x8_t test_vldrhq_z_u16(const uint16_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrhq_z_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[BASE:%.*]], i32 2, <4 x i1> [[TMP2]], <4 x i16> zeroinitializer) -// CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[TMP4]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 [[BASE:%.*]], <4 x i1> [[TMP1]], <4 x i16> zeroinitializer) +// CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x 
i32> +// CHECK-NEXT: ret <4 x i32> [[TMP3]] // uint32x4_t test_vldrhq_z_u32(const uint16_t *base, mve_pred16_t p) { @@ -502,8 +502,8 @@ uint32x4_t test_vldrhq_z_u32(const uint16_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrwq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[BASE:%.*]], align 4 -// CHECK-NEXT: ret <4 x float> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[BASE:%.*]], align 4 +// CHECK-NEXT: ret <4 x float> [[TMP0]] // float32x4_t test_vldrwq_f32(const float32_t *base) { @@ -512,8 +512,8 @@ float32x4_t test_vldrwq_f32(const float32_t *base) // CHECK-LABEL: @test_vldrwq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4 -// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4 +// CHECK-NEXT: ret <4 x i32> [[TMP0]] // int32x4_t test_vldrwq_s32(const int32_t *base) { @@ -522,8 +522,8 @@ int32x4_t test_vldrwq_s32(const int32_t *base) // CHECK-LABEL: @test_vldrwq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4 -// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE:%.*]], align 4 +// CHECK-NEXT: ret <4 x i32> [[TMP0]] // uint32x4_t test_vldrwq_u32(const uint32_t *base) { @@ -532,10 +532,10 @@ uint32x4_t test_vldrwq_u32(const uint32_t *base) // CHECK-LABEL: @test_vldrwq_z_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x float> zeroinitializer) -// CHECK-NEXT: ret <4 x float> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]], <4 x float> zeroinitializer) +// CHECK-NEXT: ret <4 x float> [[TMP2]] // float32x4_t test_vldrwq_z_f32(const float32_t *base, mve_pred16_t p) { @@ -544,10 +544,10 @@ float32x4_t test_vldrwq_z_f32(const float32_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrwq_z_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer) -// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]], <4 x i32> zeroinitializer) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vldrwq_z_s32(const int32_t *base, mve_pred16_t p) { @@ -556,10 +556,10 @@ int32x4_t test_vldrwq_z_s32(const int32_t *base, mve_pred16_t p) // CHECK-LABEL: @test_vldrwq_z_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer) -// CHECK-NEXT: ret <4 x 
i32> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]], <4 x i32> zeroinitializer) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // uint32x4_t test_vldrwq_z_u32(const uint32_t *base, mve_pred16_t p) { @@ -680,9 +680,9 @@ void test_vst1q_u32(uint32_t *base, uint32x4_t value) // CHECK-LABEL: @test_vst1q_p_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v8f16.p0(<8 x half> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v8f16.p0(<8 x half> [[VALUE:%.*]], ptr align 2 [[BASE:%.*]], <8 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vst1q_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p) @@ -696,9 +696,9 @@ void test_vst1q_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p) // CHECK-LABEL: @test_vst1q_p_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[VALUE:%.*]], ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vst1q_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p) @@ -712,9 +712,9 @@ void test_vst1q_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p) // CHECK-LABEL: @test_vst1q_p_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr align 1 [[BASE:%.*]], <16 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vst1q_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p) @@ -728,9 +728,9 @@ void test_vst1q_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p) // CHECK-LABEL: @test_vst1q_p_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr align 2 [[BASE:%.*]], <8 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vst1q_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p) @@ -744,9 +744,9 
@@ void test_vst1q_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p) // CHECK-LABEL: @test_vst1q_p_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vst1q_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p) @@ -760,9 +760,9 @@ void test_vst1q_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p) // CHECK-LABEL: @test_vst1q_p_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr align 1 [[BASE:%.*]], <16 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vst1q_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p) @@ -776,9 +776,9 @@ void test_vst1q_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p) // CHECK-LABEL: @test_vst1q_p_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr align 2 [[BASE:%.*]], <8 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vst1q_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p) @@ -792,9 +792,9 @@ void test_vst1q_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p) // CHECK-LABEL: @test_vst1q_p_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vst1q_p_u32(uint32_t *base, uint32x4_t value, mve_pred16_t p) @@ -896,9 +896,9 @@ void test_vstrbq_u32(uint8_t *base, uint32x4_t value) // CHECK-LABEL: @test_vstrbq_p_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 
[[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr align 1 [[BASE:%.*]], <16 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrbq_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p) @@ -913,9 +913,9 @@ void test_vstrbq_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p) // CHECK-LABEL: @test_vstrbq_p_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) -// CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP0]], ptr [[BASE:%.*]], i32 1, <8 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP0]], ptr align 1 [[BASE:%.*]], <8 x i1> [[TMP2]]) // CHECK-NEXT: ret void // void test_vstrbq_p_s16(int8_t *base, int16x8_t value, mve_pred16_t p) @@ -930,9 +930,9 @@ void test_vstrbq_p_s16(int8_t *base, int16x8_t value, mve_pred16_t p) // CHECK-LABEL: @test_vstrbq_p_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]]) -// CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP0]], ptr [[BASE:%.*]], i32 1, <4 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP0]], ptr align 1 [[BASE:%.*]], <4 x i1> [[TMP2]]) // CHECK-NEXT: ret void // void test_vstrbq_p_s32(int8_t *base, int32x4_t value, mve_pred16_t p) @@ -946,9 +946,9 @@ void test_vstrbq_p_s32(int8_t *base, int32x4_t value, mve_pred16_t p) // CHECK-LABEL: @test_vstrbq_p_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[VALUE:%.*]], ptr align 1 [[BASE:%.*]], <16 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrbq_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p) @@ -963,9 +963,9 @@ void test_vstrbq_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p) // CHECK-LABEL: @test_vstrbq_p_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) -// CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP0]], ptr [[BASE:%.*]], i32 1, <8 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP0]], ptr align 1 [[BASE:%.*]], <8 x i1> [[TMP2]]) // CHECK-NEXT: ret void 
// void test_vstrbq_p_u16(uint8_t *base, uint16x8_t value, mve_pred16_t p) @@ -980,9 +980,9 @@ void test_vstrbq_p_u16(uint8_t *base, uint16x8_t value, mve_pred16_t p) // CHECK-LABEL: @test_vstrbq_p_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]]) -// CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP0]], ptr [[BASE:%.*]], i32 1, <4 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i8.p0(<4 x i8> [[TMP0]], ptr align 1 [[BASE:%.*]], <4 x i1> [[TMP2]]) // CHECK-NEXT: ret void // void test_vstrbq_p_u32(uint8_t *base, uint32x4_t value, mve_pred16_t p) @@ -1068,9 +1068,9 @@ void test_vstrhq_u32(uint16_t *base, uint32x4_t value) // CHECK-LABEL: @test_vstrhq_p_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v8f16.p0(<8 x half> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v8f16.p0(<8 x half> [[VALUE:%.*]], ptr align 2 [[BASE:%.*]], <8 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrhq_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p) @@ -1084,9 +1084,9 @@ void test_vstrhq_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p) // CHECK-LABEL: @test_vstrhq_p_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr align 2 [[BASE:%.*]], <8 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrhq_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p) @@ -1101,9 +1101,9 @@ void test_vstrhq_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p) // CHECK-LABEL: @test_vstrhq_p_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]]) -// CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> [[TMP0]], ptr [[BASE:%.*]], i32 2, <4 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> [[TMP0]], ptr align 2 [[BASE:%.*]], <4 x i1> [[TMP2]]) // CHECK-NEXT: ret void // void test_vstrhq_p_s32(int16_t *base, int32x4_t value, mve_pred16_t p) @@ -1117,9 +1117,9 @@ void test_vstrhq_p_s32(int16_t *base, int32x4_t value, mve_pred16_t p) // CHECK-LABEL: @test_vstrhq_p_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: 
[[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[VALUE:%.*]], ptr align 2 [[BASE:%.*]], <8 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrhq_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p) @@ -1134,9 +1134,9 @@ void test_vstrhq_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p) // CHECK-LABEL: @test_vstrhq_p_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]]) -// CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> [[TMP0]], ptr [[BASE:%.*]], i32 2, <4 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> [[TMP0]], ptr align 2 [[BASE:%.*]], <4 x i1> [[TMP2]]) // CHECK-NEXT: ret void // void test_vstrhq_p_u32(uint16_t *base, uint32x4_t value, mve_pred16_t p) @@ -1192,9 +1192,9 @@ void test_vstrwq_u32(uint32_t *base, uint32x4_t value) // CHECK-LABEL: @test_vstrwq_p_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[VALUE:%.*]], ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrwq_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p) @@ -1208,9 +1208,9 @@ void test_vstrwq_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p) // CHECK-LABEL: @test_vstrwq_p_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrwq_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p) @@ -1224,9 +1224,9 @@ void test_vstrwq_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p) // CHECK-LABEL: @test_vstrwq_p_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) -// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr [[BASE:%.*]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: 
call void @llvm.masked.store.v4i32.p0(<4 x i32> [[VALUE:%.*]], ptr align 4 [[BASE:%.*]], <4 x i1> [[TMP1]]) // CHECK-NEXT: ret void // void test_vstrwq_p_u32(uint32_t *base, uint32x4_t value, mve_pred16_t p) diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c index 847ce67fcc31b..29425a03134e4 100644 --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c @@ -32,7 +32,7 @@ DEFINE_STRUCT(bool) // CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> [[TMP0]], i64 0) // CHECK-128-NEXT: ret <vscale x 2 x i64> [[CAST_SCALABLE]] // @@ -40,7 +40,7 @@ DEFINE_STRUCT(bool) // CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> poison, <4 x i64> [[TMP0]], i64 0) // CHECK-256-NEXT: ret <vscale x 2 x i64> [[CAST_SCALABLE]] // @@ -48,7 +48,7 @@ DEFINE_STRUCT(bool) // CHECK-512-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-512-NEXT: [[ENTRY:.*:]] // CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 64 -// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> poison, <8 x i64> [[TMP0]], i64 0) // CHECK-512-NEXT: ret <vscale x 2 x i64> [[CAST_SCALABLE]] // @@ -61,7 +61,7 @@ svint64_t read_int64(struct struct_int64 *s) { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> [[X]], i64 0) // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 -// CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: define dso_local void @write_int64( @@ -69,7 +69,7 @@ svint64_t read_int64(struct struct_int64 *s) { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[X]], i64 0) // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 -// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: ret void // // CHECK-512-LABEL: define dso_local void @write_int64( @@ -77,7 +77,7 @@ svint64_t read_int64(struct struct_int64
*s) { // CHECK-512-NEXT: [[ENTRY:.*:]] // CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[X]], i64 0) // CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 64 -// CHECK-512-NEXT: store <8 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-512-NEXT: store <8 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-512-NEXT: ret void // void write_int64(struct struct_int64 *s, svint64_t x) { @@ -92,7 +92,7 @@ void write_int64(struct struct_int64 *s, svint64_t x) { // CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> poison, <2 x double> [[TMP0]], i64 0) // CHECK-128-NEXT: ret <vscale x 2 x double> [[CAST_SCALABLE]] // @@ -100,7 +100,7 @@ void write_int64(struct struct_int64 *s, svint64_t x) { // CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> poison, <4 x double> [[TMP0]], i64 0) // CHECK-256-NEXT: ret <vscale x 2 x double> [[CAST_SCALABLE]] // @@ -108,7 +108,7 @@ void write_int64(struct struct_int64 *s, svint64_t x) { // CHECK-512-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-512-NEXT: [[ENTRY:.*:]] // CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 64 -// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v8f64(<vscale x 2 x double> poison, <8 x double> [[TMP0]], i64 0) // CHECK-512-NEXT: ret <vscale x 2 x double> [[CAST_SCALABLE]] // @@ -121,7 +121,7 @@ svfloat64_t read_float64(struct struct_float64 *s) { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> [[X]], i64 0) // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 -// CHECK-128-NEXT: store <2 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-128-NEXT: store <2 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: define dso_local void @write_float64( @@ -129,7 +129,7 @@ svfloat64_t read_float64(struct struct_float64 *s) { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[X]], i64 0) // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 -// CHECK-256-NEXT: store <4 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-256-NEXT: store <4 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: ret void //
// CHECK-512-LABEL: define dso_local void @write_float64( @@ -137,7 +137,7 @@ svfloat64_t read_float64(struct struct_float64 *s) { // CHECK-512-NEXT: [[ENTRY:.*:]] // CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x double> @llvm.vector.extract.v8f64.nxv2f64(<vscale x 2 x double> [[X]], i64 0) // CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 64 -// CHECK-512-NEXT: store <8 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-512-NEXT: store <8 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-512-NEXT: ret void // void write_float64(struct struct_float64 *s, svfloat64_t x) { @@ -152,7 +152,7 @@ void write_float64(struct struct_float64 *s, svfloat64_t x) { // CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> poison, <8 x bfloat> [[TMP0]], i64 0) // CHECK-128-NEXT: ret <vscale x 8 x bfloat> [[CAST_SCALABLE]] // @@ -160,7 +160,7 @@ void write_float64(struct struct_float64 *s, svfloat64_t x) { // CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <16 x bfloat>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <16 x bfloat>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v16bf16(<vscale x 8 x bfloat> poison, <16 x bfloat> [[TMP0]], i64 0) // CHECK-256-NEXT: ret <vscale x 8 x bfloat> [[CAST_SCALABLE]] // @@ -168,7 +168,7 @@ void write_float64(struct struct_float64 *s, svfloat64_t x) { // CHECK-512-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-512-NEXT: [[ENTRY:.*:]] // CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 64 -// CHECK-512-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v32bf16(<vscale x 8 x bfloat> poison, <32 x bfloat> [[TMP0]], i64 0) // CHECK-512-NEXT: ret <vscale x 8 x bfloat> [[CAST_SCALABLE]] // @@ -181,7 +181,7 @@ svbfloat16_t read_bfloat16(struct struct_bfloat16 *s) { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x bfloat> @llvm.vector.extract.v8bf16.nxv8bf16(<vscale x 8 x bfloat> [[X]], i64 0) // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 16 -// CHECK-128-NEXT: store <8 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-128-NEXT: store <8 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: define dso_local void @write_bfloat16( @@ -189,7 +189,7 @@ svbfloat16_t read_bfloat16(struct struct_bfloat16 *s) { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[X]], i64 0) // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 32 -// CHECK-256-NEXT: store <16 x
bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-256-NEXT: store <16 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: ret void // // CHECK-512-LABEL: define dso_local void @write_bfloat16( @@ -197,7 +197,7 @@ svbfloat16_t read_bfloat16(struct struct_bfloat16 *s) { // CHECK-512-NEXT: [[ENTRY:.*:]] // CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <32 x bfloat> @llvm.vector.extract.v32bf16.nxv8bf16(<vscale x 8 x bfloat> [[X]], i64 0) // CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 64 -// CHECK-512-NEXT: store <32 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-512-NEXT: store <32 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-512-NEXT: ret void // void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) { @@ -212,7 +212,7 @@ void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) { // CHECK-128-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-128-NEXT: [[ENTRY:.*:]] // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 2 -// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr [[Y]], align 2, !tbaa [[CHAR_TBAA2]] +// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr [[Y]], align 2, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> poison, <2 x i8> [[TMP0]], i64 0) // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast <vscale x 2 x i8> [[CAST_SCALABLE]] to <vscale x 16 x i1> // CHECK-128-NEXT: ret <vscale x 16 x i1> [[TMP1]] @@ -221,7 +221,7 @@ void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) { // CHECK-256-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-256-NEXT: [[ENTRY:.*:]] // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 4 -// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[Y]], align 2, !tbaa [[CHAR_TBAA2]] +// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[Y]], align 2, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> poison, <4 x i8> [[TMP0]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast <vscale x 2 x i8> [[CAST_SCALABLE]] to <vscale x 16 x i1> // CHECK-256-NEXT: ret <vscale x 16 x i1> [[TMP1]] @@ -230,7 +230,7 @@ void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) { // CHECK-512-SAME: ptr noundef readonly captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-512-NEXT: [[ENTRY:.*:]] // CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 -// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 2, !tbaa [[CHAR_TBAA2]] +// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 2, !tbaa [[CHAR_TBAA6]] // CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> poison, <8 x i8> [[TMP0]], i64 0) // CHECK-512-NEXT: [[TMP1:%.*]] = bitcast <vscale x 2 x i8> [[CAST_SCALABLE]] to <vscale x 16 x i1> // CHECK-512-NEXT: ret <vscale x 16 x i1> [[TMP1]] @@ -245,7 +245,7 @@ svbool_t read_bool(struct struct_bool *s) { // CHECK-128-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i1> [[X]] to <vscale x 2 x i8> // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> [[TMP0]], i64 0) // CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 2 -// CHECK-128-NEXT: store <2 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[CHAR_TBAA2]] +// CHECK-128-NEXT: store <2 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[CHAR_TBAA6]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: define dso_local void @write_bool( @@ -254,7 +254,7 @@ svbool_t read_bool(struct
struct_bool *s) { // CHECK-256-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i1> [[X]] to <vscale x 2 x i8> // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i8> @llvm.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> [[TMP0]], i64 0) // CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 4 -// CHECK-256-NEXT: store <4 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[CHAR_TBAA2]] +// CHECK-256-NEXT: store <4 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[CHAR_TBAA6]] // CHECK-256-NEXT: ret void // // CHECK-512-LABEL: define dso_local void @write_bool( @@ -263,22 +263,22 @@ svbool_t read_bool(struct struct_bool *s) { // CHECK-512-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i1> [[X]] to <vscale x 2 x i8> // CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> [[TMP0]], i64 0) // CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 8 -// CHECK-512-NEXT: store <8 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[CHAR_TBAA2]] +// CHECK-512-NEXT: store <8 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[CHAR_TBAA6]] // CHECK-512-NEXT: ret void // void write_bool(struct struct_bool *s, svbool_t x) { s->y[0] = x; } //. -// CHECK-128: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK-128: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} -// CHECK-128: [[META4]] = !{!"Simple C/C++ TBAA"} +// CHECK-128: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK-128: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK-128: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0} //. -// CHECK-256: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK-256: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} -// CHECK-256: [[META4]] = !{!"Simple C/C++ TBAA"} +// CHECK-256: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK-256: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK-256: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0} //. -// CHECK-512: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK-512: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} -// CHECK-512: [[META4]] = !{!"Simple C/C++ TBAA"} +// CHECK-512: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK-512: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK-512: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0} //.
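Note on the TBAA churn in this file and the two SVE test files that follow: the access tag FileCheck used to capture as CHAR_TBAA2 is now CHAR_TBAA6 only because the metadata block at the end of each file is emitted in a different order (parent nodes before the access tag that uses them); the triple itself is unchanged. A minimal sketch of the relabeled LLVM IR metadata, mirroring the updated CHECK lines above (the concrete !4/!5/!6 numbers are whatever the module happens to assign, not a fixed numbering):

  ; "Simple C/C++ TBAA" root and "omnipotent char" type node, printed first
  !4 = !{!"omnipotent char", !5, i64 0}
  !5 = !{!"Simple C/C++ TBAA"}
  ; the access tag referenced by !tbaa on the loads and stores, printed last
  !6 = !{!4, !4, i64 0}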
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c index bdaebf7ec1da7..7e3223f671f27 100644 --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c @@ -79,7 +79,7 @@ svint64_t lax_cast(fixed_int32_t type) { // CHECK-LABEL: define dso_local <vscale x 4 x i32> @to_svint32_t__from_gnu_int32_t( // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0]], align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] +// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0]], align 16, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret <vscale x 4 x i32> [[CAST_SCALABLE]] // @@ -91,7 +91,7 @@ svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) { // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i32>) align 16 captures(none) initializes((0, 64)) [[AGG_RESULT:%.*]], <vscale x 4 x i32> [[TYPE:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TYPE]], i64 0) -// CHECK-NEXT: store <16 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) { @@ -101,7 +101,7 @@ gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) { // CHECK-LABEL: define dso_local <vscale x 4 x i32> @to_fixed_int32_t__from_gnu_int32_t( // CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TYPE]], i64 0) // CHECK-NEXT: ret <vscale x 4 x i32> [[CAST_SCALABLE]] // @@ -113,14 +113,14 @@ fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) { // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<16 x i32>) align 16 captures(none) initializes((0, 64)) [[AGG_RESULT:%.*]], <vscale x 4 x i32> noundef [[TYPE_COERCE:%.*]]) local_unnamed_addr #[[ATTR3]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TYPE_COERCE]], i64 0) -// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[AGG_RESULT]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[AGG_RESULT]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // gnu_int32_t from_fixed_int32_t__to_gnu_int32_t(fixed_int32_t type) { return type; } //. -// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0} //.
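For reference, the Arm MVE CHECK-line churn earlier in this patch is one mechanical rewrite applied throughout: the i32 alignment immediate that @llvm.masked.load and @llvm.masked.store used to take as a call operand is now carried as an align parameter attribute on the pointer operand. A minimal before/after sketch in LLVM IR, with operand names (%base, %p) taken from the vldrwq tests above purely for illustration; the two calls show the old and new form of the same operation, not one well-formed module:

  ; old form: alignment passed as the second call operand
  %old = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %base, i32 4, <4 x i1> %p, <4 x i32> zeroinitializer)
  ; new form: alignment expressed as an attribute on the pointer argument
  %new = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %base, <4 x i1> %p, <4 x i32> zeroinitializer)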
diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c
index b604a06d76a30..ae011db633b6a 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-globals.c
@@ -24,14 +24,14 @@ fixed_bool_t global_bool;
// CHECK-128-SAME: <vscale x 2 x i64> [[V:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-128-NEXT: [[ENTRY:.*:]]
// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> [[V]], i64 0)
-// CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr @global_i64, align 16, !tbaa [[CHAR_TBAA2:![0-9]+]]
+// CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr @global_i64, align 16, !tbaa [[CHAR_TBAA6:![0-9]+]]
// CHECK-128-NEXT: ret void
//
// CHECK-512-LABEL: define void @write_global_i64(
// CHECK-512-SAME: <vscale x 2 x i64> [[V:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-512-NEXT: [[ENTRY:.*:]]
// CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[V]], i64 0)
-// CHECK-512-NEXT: store <8 x i64> [[CAST_FIXED]], ptr @global_i64, align 16, !tbaa [[CHAR_TBAA2:![0-9]+]]
+// CHECK-512-NEXT: store <8 x i64> [[CAST_FIXED]], ptr @global_i64, align 16, !tbaa [[CHAR_TBAA6:![0-9]+]]
// CHECK-512-NEXT: ret void
//
void write_global_i64(svint64_t v) { global_i64 = v; }
@@ -40,14 +40,14 @@ void write_global_i64(svint64_t v) { global_i64 = v; }
// CHECK-128-SAME: <vscale x 8 x bfloat> [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-128-NEXT: [[ENTRY:.*:]]
// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x bfloat> @llvm.vector.extract.v8bf16.nxv8bf16(<vscale x 8 x bfloat> [[V]], i64 0)
-// CHECK-128-NEXT: store <8 x bfloat> [[CAST_FIXED]], ptr @global_bf16, align 16, !tbaa [[CHAR_TBAA2]]
+// CHECK-128-NEXT: store <8 x bfloat> [[CAST_FIXED]], ptr @global_bf16, align 16, !tbaa [[CHAR_TBAA6]]
// CHECK-128-NEXT: ret void
//
// CHECK-512-LABEL: define void @write_global_bf16(
// CHECK-512-SAME: <vscale x 8 x bfloat> [[V:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-512-NEXT: [[ENTRY:.*:]]
// CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <32 x bfloat> @llvm.vector.extract.v32bf16.nxv8bf16(<vscale x 8 x bfloat> [[V]], i64 0)
-// CHECK-512-NEXT: store <32 x bfloat> [[CAST_FIXED]], ptr @global_bf16, align 16, !tbaa [[CHAR_TBAA2]]
+// CHECK-512-NEXT: store <32 x bfloat> [[CAST_FIXED]], ptr @global_bf16, align 16, !tbaa [[CHAR_TBAA6]]
// CHECK-512-NEXT: ret void
//
void write_global_bf16(svbfloat16_t v) { global_bf16 = v; }
@@ -57,7 +57,7 @@ void write_global_bf16(svbfloat16_t v) { global_bf16 = v; }
// CHECK-128-NEXT: [[ENTRY:.*:]]
// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i1> [[V]] to <vscale x 2 x i8>
// CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> [[TMP0]], i64 0)
-// CHECK-128-NEXT: store <2 x i8> [[CAST_FIXED]], ptr @global_bool, align 2, !tbaa [[CHAR_TBAA2]]
+// CHECK-128-NEXT: store <2 x i8> [[CAST_FIXED]], ptr @global_bool, align 2, !tbaa [[CHAR_TBAA6]]
// CHECK-128-NEXT: ret void
//
// CHECK-512-LABEL: define void @write_global_bool(
@@ -65,7 +65,7 @@ void write_global_bf16(svbfloat16_t v) { global_bf16 = v; }
// CHECK-512-NEXT: [[ENTRY:.*:]]
// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i1> [[V]] to <vscale x 2 x i8>
// CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> [[TMP0]], i64 0)
-// CHECK-512-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool, align 2, !tbaa [[CHAR_TBAA2]]
+// CHECK-512-NEXT: store <8 x i8> [[CAST_FIXED]], ptr @global_bool, align 2, !tbaa [[CHAR_TBAA6]]
// CHECK-512-NEXT: ret void
//
void write_global_bool(svbool_t v) { global_bool = v; }
@@ -77,14 +77,14 @@ void write_global_bool(svbool_t v) { global_bool = v; }
// CHECK-128-LABEL: define <vscale x 2 x i64> @read_global_i64(
// CHECK-128-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] {
// CHECK-128-NEXT: [[ENTRY:.*:]]
-// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @global_i64, align 16, !tbaa [[CHAR_TBAA2]]
+// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @global_i64, align 16, !tbaa [[CHAR_TBAA6]]
// CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> [[TMP0]], i64 0)
// CHECK-128-NEXT: ret <vscale x 2 x i64> [[CAST_SCALABLE]]
//
// CHECK-512-LABEL: define <vscale x 2 x i64> @read_global_i64(
// CHECK-512-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] {
// CHECK-512-NEXT: [[ENTRY:.*:]]
-// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @global_i64, align 16, !tbaa [[CHAR_TBAA2]]
+// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @global_i64, align 16, !tbaa [[CHAR_TBAA6]]
// CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> poison, <8 x i64> [[TMP0]], i64 0)
// CHECK-512-NEXT: ret <vscale x 2 x i64> [[CAST_SCALABLE]]
//
@@ -93,14 +93,14 @@ svint64_t read_global_i64() { return global_i64; }
// CHECK-128-LABEL: define <vscale x 8 x bfloat> @read_global_bf16(
// CHECK-128-SAME: ) local_unnamed_addr #[[ATTR2]] {
// CHECK-128-NEXT: [[ENTRY:.*:]]
-// CHECK-128-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr @global_bf16, align 16, !tbaa [[CHAR_TBAA2]]
+// CHECK-128-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr @global_bf16, align 16, !tbaa [[CHAR_TBAA6]]
// CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> poison, <8 x bfloat> [[TMP0]], i64 0)
// CHECK-128-NEXT: ret <vscale x 8 x bfloat> [[CAST_SCALABLE]]
//
// CHECK-512-LABEL: define <vscale x 8 x bfloat> @read_global_bf16(
// CHECK-512-SAME: ) local_unnamed_addr #[[ATTR2]] {
// CHECK-512-NEXT: [[ENTRY:.*:]]
-// CHECK-512-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr @global_bf16, align 16, !tbaa [[CHAR_TBAA2]]
+// CHECK-512-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr @global_bf16, align 16, !tbaa [[CHAR_TBAA6]]
// CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v32bf16(<vscale x 8 x bfloat> poison, <32 x bfloat> [[TMP0]], i64 0)
// CHECK-512-NEXT: ret <vscale x 8 x bfloat> [[CAST_SCALABLE]]
//
@@ -109,7 +109,7 @@ svbfloat16_t read_global_bf16() { return global_bf16; }
// CHECK-128-LABEL: define <vscale x 16 x i1> @read_global_bool(
// CHECK-128-SAME: ) local_unnamed_addr #[[ATTR2]] {
// CHECK-128-NEXT: [[ENTRY:.*:]]
-// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr @global_bool, align 2, !tbaa [[CHAR_TBAA2]]
+// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr @global_bool, align 2, !tbaa [[CHAR_TBAA6]]
// CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> poison, <2 x i8> [[TMP0]], i64 0)
// CHECK-128-NEXT: [[TMP1:%.*]] = bitcast <vscale x 2 x i8> [[CAST_SCALABLE]] to <vscale x 16 x i1>
// CHECK-128-NEXT: ret <vscale x 16 x i1> [[TMP1]]
@@ -117,18 +117,18 @@ svbfloat16_t read_global_bf16() { return global_bf16; }
// CHECK-512-LABEL: define <vscale x 16 x i1> @read_global_bool(
// CHECK-512-SAME: ) local_unnamed_addr #[[ATTR2]] {
// CHECK-512-NEXT: [[ENTRY:.*:]]
-// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool, align 2, !tbaa [[CHAR_TBAA2]]
+// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr @global_bool, align 2, !tbaa [[CHAR_TBAA6]]
// CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> poison, <8 x i8> [[TMP0]], i64 0)
// CHECK-512-NEXT: [[TMP1:%.*]] = bitcast <vscale x 2 x i8> [[CAST_SCALABLE]] to <vscale x 16 x i1>
// CHECK-512-NEXT: ret <vscale x 16 x i1> [[TMP1]]
//
svbool_t read_global_bool() { return global_bool; }
//.
-// CHECK-128: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// CHECK-128: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
-// CHECK-128: [[META4]] = !{!"Simple C/C++ TBAA"}
+// CHECK-128: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK-128: [[META5]] = !{!"Simple C/C++ TBAA"}
+// CHECK-128: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0}
//.
-// CHECK-512: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// CHECK-512: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
-// CHECK-512: [[META4]] = !{!"Simple C/C++ TBAA"}
+// CHECK-512: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK-512: [[META5]] = !{!"Simple C/C++ TBAA"}
+// CHECK-512: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0}
//.
diff --git a/clang/test/CodeGen/attr-counted-by-for-pointers.c b/clang/test/CodeGen/attr-counted-by-for-pointers.c
index f7b737d5c5039..c5729fd017d8c 100644
--- a/clang/test/CodeGen/attr-counted-by-for-pointers.c
+++ b/clang/test/CodeGen/attr-counted-by-for-pointers.c
@@ -33,47 +33,47 @@ struct annotated_ptr {
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 8
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2:![0-9]+]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT10:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6:![0-9]+]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT10:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7:![0-9]+]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3:[0-9]+]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3:[0-9]+]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT10]]:
// SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
-// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA4:![0-9]+]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA8:![0-9]+]]
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 [[IDXPROM]]
-// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA13:![0-9]+]]
+// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA14:![0-9]+]]
// SANITIZE-WITH-ATTR-NEXT: ret void
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test1(
// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]])
local_unnamed_addr #[[ATTR0:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA6:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11:![0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA12:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test1( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA6:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11:![0-9]+]] +// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA12:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test1( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA6:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11:![0-9]+]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA12:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test1(struct annotated_ptr *p, int index, struct foo *value) { @@ -86,47 +86,47 @@ void test1(struct annotated_ptr *p, int index, struct foo *value) { // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 8 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// 
SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT10:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT10:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[CONT10]]: // SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA8]] // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA13]] +// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA14]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA6]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11]] +// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA12]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test2( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA6]] // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// 
SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11]] +// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA12]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test2( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA6]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA12]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test2(struct annotated_ptr *p, int index, struct foo *value) { @@ -139,47 +139,47 @@ void test2(struct annotated_ptr *p, int index, struct foo *value) { // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 8 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT10:.*]], !prof [[PROF15:![0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT10:.*]], !prof [[PROF16:![0-9]+]], !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[CONT10]]: // SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA8]] // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA13]] +// SANITIZE-WITH-ATTR-NEXT: 
store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA14]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( // NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITH-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA6]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11]] +// NO-SANITIZE-WITH-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA12]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA6]] // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11]] +// SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA12]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readonly captures(none) [[P:%.*]], i32 noundef [[INDEX:%.*]], ptr noundef [[VALUE:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRY:.*:]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BUF:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BUF]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA6]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[TMP0]], i64 [[IDXPROM]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA11]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr [[VALUE]], ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS3FOOPTR_TBAA12]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // void test3(struct annotated_ptr *p, int index, struct foo *value) { @@ -264,12 +264,12 @@ size_t test5(struct annotated_ptr *p, int index) { // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: 
[[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT8:.*]], !prof [[PROF15]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT8:.*]], !prof [[PROF16]], !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[CONT8]]: // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]] @@ -312,12 +312,12 @@ size_t test6(struct annotated_ptr *p, int index) { // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 8 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT10:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT10:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[CONT10]]: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -384,12 +384,12 @@ size_t test8(struct annotated_sized_ptr *p, int index) { // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to 
i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT8:.*]], !prof [[PROF15]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT8:.*]], !prof [[PROF16]], !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[CONT8]]: // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]] @@ -430,12 +430,12 @@ size_t test9(struct annotated_sized_ptr *p, int index) { // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT8:.*]], !prof [[PROF15]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT8:.*]], !prof [[PROF16]], !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR3]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[CONT8]]: // SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[INDEX_SIZE:%.*]] = shl nuw nsw i64 [[IDXPROM]], 2 @@ -549,54 +549,54 @@ size_t test12(struct pr151236_struct *p) { return __bdos(p->a) + __bdos(((int *)p->a)); } //. 
-// SANITIZE-WITH-ATTR: [[META2]] = !{}
-// SANITIZE-WITH-ATTR: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1}
-// SANITIZE-WITH-ATTR: [[_ZTS3FOOPTR_TBAA4]] = !{[[META5:![0-9]+]], [[META9:![0-9]+]], i64 8}
-// SANITIZE-WITH-ATTR: [[META5]] = !{!"annotated_ptr", [[META6:![0-9]+]], i64 0, [[META9]], i64 8, [[META12:![0-9]+]], i64 16}
-// SANITIZE-WITH-ATTR: [[META6]] = !{!"long", [[META7:![0-9]+]], i64 0}
-// SANITIZE-WITH-ATTR: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0}
-// SANITIZE-WITH-ATTR: [[META8]] = !{!"Simple C/C++ TBAA"}
-// SANITIZE-WITH-ATTR: [[META9]] = !{!"p2 _ZTS3foo", [[META10:![0-9]+]], i64 0}
-// SANITIZE-WITH-ATTR: [[META10]] = !{!"any p2 pointer", [[META11:![0-9]+]], i64 0}
-// SANITIZE-WITH-ATTR: [[META11]] = !{!"any pointer", [[META7]], i64 0}
-// SANITIZE-WITH-ATTR: [[META12]] = !{!"int", [[META7]], i64 0}
-// SANITIZE-WITH-ATTR: [[_ZTS3FOOPTR_TBAA13]] = !{[[META14:![0-9]+]], [[META14]], i64 0}
-// SANITIZE-WITH-ATTR: [[META14]] = !{!"p1 _ZTS3foo", [[META11]], i64 0}
-// SANITIZE-WITH-ATTR: [[PROF15]] = !{!"branch_weights", i32 1, i32 1048575}
+// SANITIZE-WITH-ATTR: [[META3:![0-9]+]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// SANITIZE-WITH-ATTR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// SANITIZE-WITH-ATTR: [[META5]] = !{!"Simple C/C++ TBAA"}
+// SANITIZE-WITH-ATTR: [[META6]] = !{}
+// SANITIZE-WITH-ATTR: [[PROF7]] = !{!"branch_weights", i32 1048575, i32 1}
+// SANITIZE-WITH-ATTR: [[_ZTS3FOOPTR_TBAA8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]], i64 8}
+// SANITIZE-WITH-ATTR: [[META9]] = !{!"annotated_ptr", [[META10:![0-9]+]], i64 0, [[META11]], i64 8, [[META3]], i64 16}
+// SANITIZE-WITH-ATTR: [[META10]] = !{!"long", [[META4]], i64 0}
+// SANITIZE-WITH-ATTR: [[META11]] = !{!"p2 _ZTS3foo", [[META12:![0-9]+]], i64 0}
+// SANITIZE-WITH-ATTR: [[META12]] = !{!"any p2 pointer", [[META13:![0-9]+]], i64 0}
+// SANITIZE-WITH-ATTR: [[META13]] = !{!"any pointer", [[META4]], i64 0}
+// SANITIZE-WITH-ATTR: [[_ZTS3FOOPTR_TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0}
+// SANITIZE-WITH-ATTR: [[META15]] = !{!"p1 _ZTS3foo", [[META13]], i64 0}
+// SANITIZE-WITH-ATTR: [[PROF16]] = !{!"branch_weights", i32 1, i32 1048575}
//.
-// NO-SANITIZE-WITH-ATTR: [[_ZTS3FOOPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META7:![0-9]+]], i64 8}
-// NO-SANITIZE-WITH-ATTR: [[META3]] = !{!"annotated_ptr", [[META4:![0-9]+]], i64 0, [[META7]], i64 8, [[META10:![0-9]+]], i64 16}
-// NO-SANITIZE-WITH-ATTR: [[META4]] = !{!"long", [[META5:![0-9]+]], i64 0}
-// NO-SANITIZE-WITH-ATTR: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
-// NO-SANITIZE-WITH-ATTR: [[META6]] = !{!"Simple C/C++ TBAA"}
-// NO-SANITIZE-WITH-ATTR: [[META7]] = !{!"p2 _ZTS3foo", [[META8:![0-9]+]], i64 0}
-// NO-SANITIZE-WITH-ATTR: [[META8]] = !{!"any p2 pointer", [[META9:![0-9]+]], i64 0}
-// NO-SANITIZE-WITH-ATTR: [[META9]] = !{!"any pointer", [[META5]], i64 0}
-// NO-SANITIZE-WITH-ATTR: [[META10]] = !{!"int", [[META5]], i64 0}
-// NO-SANITIZE-WITH-ATTR: [[_ZTS3FOOPTR_TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0}
-// NO-SANITIZE-WITH-ATTR: [[META12]] = !{!"p1 _ZTS3foo", [[META9]], i64 0}
+// NO-SANITIZE-WITH-ATTR: [[META3:![0-9]+]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// NO-SANITIZE-WITH-ATTR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// NO-SANITIZE-WITH-ATTR: [[META5]] = !{!"Simple C/C++ TBAA"}
+// NO-SANITIZE-WITH-ATTR: [[_ZTS3FOOPTR_TBAA6]] = !{[[META7:![0-9]+]], [[META9:![0-9]+]], i64 8}
+// NO-SANITIZE-WITH-ATTR: [[META7]] = !{!"annotated_ptr", [[META8:![0-9]+]], i64 0, [[META9]], i64 8, [[META3]], i64 16}
+// NO-SANITIZE-WITH-ATTR: [[META8]] = !{!"long", [[META4]], i64 0}
+// NO-SANITIZE-WITH-ATTR: [[META9]] = !{!"p2 _ZTS3foo", [[META10:![0-9]+]], i64 0}
+// NO-SANITIZE-WITH-ATTR: [[META10]] = !{!"any p2 pointer", [[META11:![0-9]+]], i64 0}
+// NO-SANITIZE-WITH-ATTR: [[META11]] = !{!"any pointer", [[META4]], i64 0}
+// NO-SANITIZE-WITH-ATTR: [[_ZTS3FOOPTR_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
+// NO-SANITIZE-WITH-ATTR: [[META13]] = !{!"p1 _ZTS3foo", [[META11]], i64 0}
//.
-// SANITIZE-WITHOUT-ATTR: [[_ZTS3FOOPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META7:![0-9]+]], i64 8}
-// SANITIZE-WITHOUT-ATTR: [[META3]] = !{!"annotated_ptr", [[META4:![0-9]+]], i64 0, [[META7]], i64 8, [[META10:![0-9]+]], i64 16}
-// SANITIZE-WITHOUT-ATTR: [[META4]] = !{!"long", [[META5:![0-9]+]], i64 0}
-// SANITIZE-WITHOUT-ATTR: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
-// SANITIZE-WITHOUT-ATTR: [[META6]] = !{!"Simple C/C++ TBAA"}
-// SANITIZE-WITHOUT-ATTR: [[META7]] = !{!"p2 _ZTS3foo", [[META8:![0-9]+]], i64 0}
-// SANITIZE-WITHOUT-ATTR: [[META8]] = !{!"any p2 pointer", [[META9:![0-9]+]], i64 0}
-// SANITIZE-WITHOUT-ATTR: [[META9]] = !{!"any pointer", [[META5]], i64 0}
-// SANITIZE-WITHOUT-ATTR: [[META10]] = !{!"int", [[META5]], i64 0}
-// SANITIZE-WITHOUT-ATTR: [[_ZTS3FOOPTR_TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0}
-// SANITIZE-WITHOUT-ATTR: [[META12]] = !{!"p1 _ZTS3foo", [[META9]], i64 0}
+// SANITIZE-WITHOUT-ATTR: [[META3:![0-9]+]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// SANITIZE-WITHOUT-ATTR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// SANITIZE-WITHOUT-ATTR: [[META5]] = !{!"Simple C/C++ TBAA"}
+// SANITIZE-WITHOUT-ATTR: [[_ZTS3FOOPTR_TBAA6]] = !{[[META7:![0-9]+]], [[META9:![0-9]+]], i64 8}
+// SANITIZE-WITHOUT-ATTR: [[META7]] = !{!"annotated_ptr", [[META8:![0-9]+]], i64 0, [[META9]], i64 8, [[META3]], i64 16}
+// SANITIZE-WITHOUT-ATTR: [[META8]] = !{!"long", [[META4]], i64 0}
+// SANITIZE-WITHOUT-ATTR: [[META9]] = !{!"p2 _ZTS3foo", [[META10:![0-9]+]], i64 0}
+// SANITIZE-WITHOUT-ATTR: [[META10]] = !{!"any p2 pointer", [[META11:![0-9]+]], i64 0}
+// SANITIZE-WITHOUT-ATTR: [[META11]] = !{!"any pointer", [[META4]], i64 0}
+// SANITIZE-WITHOUT-ATTR: [[_ZTS3FOOPTR_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
+// SANITIZE-WITHOUT-ATTR: [[META13]] = !{!"p1 _ZTS3foo", [[META11]], i64 0}
//.
-// NO-SANITIZE-WITHOUT-ATTR: [[_ZTS3FOOPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META7:![0-9]+]], i64 8}
-// NO-SANITIZE-WITHOUT-ATTR: [[META3]] = !{!"annotated_ptr", [[META4:![0-9]+]], i64 0, [[META7]], i64 8, [[META10:![0-9]+]], i64 16}
-// NO-SANITIZE-WITHOUT-ATTR: [[META4]] = !{!"long", [[META5:![0-9]+]], i64 0}
-// NO-SANITIZE-WITHOUT-ATTR: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
-// NO-SANITIZE-WITHOUT-ATTR: [[META6]] = !{!"Simple C/C++ TBAA"}
-// NO-SANITIZE-WITHOUT-ATTR: [[META7]] = !{!"p2 _ZTS3foo", [[META8:![0-9]+]], i64 0}
-// NO-SANITIZE-WITHOUT-ATTR: [[META8]] = !{!"any p2 pointer", [[META9:![0-9]+]], i64 0}
-// NO-SANITIZE-WITHOUT-ATTR: [[META9]] = !{!"any pointer", [[META5]], i64 0}
-// NO-SANITIZE-WITHOUT-ATTR: [[META10]] = !{!"int", [[META5]], i64 0}
-// NO-SANITIZE-WITHOUT-ATTR: [[_ZTS3FOOPTR_TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0}
-// NO-SANITIZE-WITHOUT-ATTR: [[META12]] = !{!"p1 _ZTS3foo", [[META9]], i64 0}
+// NO-SANITIZE-WITHOUT-ATTR: [[META3:![0-9]+]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// NO-SANITIZE-WITHOUT-ATTR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// NO-SANITIZE-WITHOUT-ATTR: [[META5]] = !{!"Simple C/C++ TBAA"}
+// NO-SANITIZE-WITHOUT-ATTR: [[_ZTS3FOOPTR_TBAA6]] = !{[[META7:![0-9]+]], [[META9:![0-9]+]], i64 8}
+// NO-SANITIZE-WITHOUT-ATTR: [[META7]] = !{!"annotated_ptr", [[META8:![0-9]+]], i64 0, [[META9]], i64 8, [[META3]], i64 16}
+// NO-SANITIZE-WITHOUT-ATTR: [[META8]] = !{!"long", [[META4]], i64 0}
+// NO-SANITIZE-WITHOUT-ATTR: [[META9]] = !{!"p2 _ZTS3foo", [[META10:![0-9]+]], i64 0}
+// NO-SANITIZE-WITHOUT-ATTR: [[META10]] = !{!"any p2 pointer", [[META11:![0-9]+]], i64 0}
+// NO-SANITIZE-WITHOUT-ATTR: [[META11]] = !{!"any pointer", [[META4]], i64 0}
+// NO-SANITIZE-WITHOUT-ATTR: [[_ZTS3FOOPTR_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
+// NO-SANITIZE-WITHOUT-ATTR: [[META13]] = !{!"p1 _ZTS3foo", [[META11]], i64 0}
//.
diff --git a/clang/test/CodeGen/attr-counted-by-pr110385.c b/clang/test/CodeGen/attr-counted-by-pr110385.c
index 32ee1c8eb5dbe..e854ed48e2c58 100644
--- a/clang/test/CodeGen/attr-counted-by-pr110385.c
+++ b/clang/test/CodeGen/attr-counted-by-pr110385.c
@@ -29,7 +29,7 @@ void init(void * __attribute__((pass_dynamic_object_size(0))));
// CHECK-SAME: ptr noundef readonly captures(none) [[FOO:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[GROWABLE:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 8
-// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GROWABLE]], align 8, !tbaa [[_ZTS8VARIABLEPTR_TBAA2:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GROWABLE]], align 8, !tbaa [[_ZTS8VARIABLEPTR_TBAA6:![0-9]+]]
// CHECK-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12
// CHECK-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
// CHECK-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
@@ -61,11 +61,11 @@ void test2(struct bucket2 *foo) {
  init(foo->growable.array);
}
//.
-// CHECK: [[_ZTS8VARIABLEPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META7:![0-9]+]], i64 8} -// CHECK: [[META3]] = !{!"bucket", [[META4:![0-9]+]], i64 0, [[META7]], i64 8, [[META4]], i64 16} -// CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} -// CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// CHECK: [[META6]] = !{!"Simple C/C++ TBAA"} -// CHECK: [[META7]] = !{!"p1 _ZTS8variable", [[META8:![0-9]+]], i64 0} -// CHECK: [[META8]] = !{!"any pointer", [[META5]], i64 0} +// CHECK: [[META3:![0-9]+]] = !{!"int", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[_ZTS8VARIABLEPTR_TBAA6]] = !{[[META7:![0-9]+]], [[META8:![0-9]+]], i64 8} +// CHECK: [[META7]] = !{!"bucket", [[META3]], i64 0, [[META8]], i64 8, [[META3]], i64 16} +// CHECK: [[META8]] = !{!"p1 _ZTS8variable", [[META9:![0-9]+]], i64 0} +// CHECK: [[META9]] = !{!"any pointer", [[META4]], i64 0} //. diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index 9675fe21be366..86c59fb2b14ea 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -64,16 +64,16 @@ struct anon_struct { // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2:![0-9]+]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7:![0-9]+]], !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8:[0-9]+]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8:[0-9]+]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[CONT3]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test1( @@ -112,18 +112,18 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // 
SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT6:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT6:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[CONT6]]: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0) // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP2]], 2 -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2( @@ -235,16 +235,16 @@ size_t test2_bdos_cast(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]] // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[CONT3]]: // SANITIZE-WITH-ATTR-NEXT: 
[[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]] -// SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( @@ -331,63 +331,63 @@ size_t test3_bdos_cast(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 2 -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 3) #[[ATTR8]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 3) #[[ATTR8]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[CONT1]]: // SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl i32 [[DOTCOUNTED_BY_LOAD]], 2 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT12:.*]], label %[[HANDLER_OUT_OF_BOUNDS8:.*]], !prof [[PROF3]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT12:.*]], label %[[HANDLER_OUT_OF_BOUNDS8:.*]], !prof [[PROF7]], !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS8]]: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]] +// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]] // SANITIZE-WITH-ATTR: [[CONT12]]: // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[DOTCOUNTED_BY_LOAD]], 2 // SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 244 // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = and i32 [[RESULT]], 252 // SANITIZE-WITH-ATTR-NEXT: [[CONV2:%.*]] = select i1 [[TMP3]], i32 [[TMP4]], i32 0 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[IDXPROM]] -// 
SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV2]], ptr [[ARRAYIDX10]], align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV2]], ptr [[ARRAYIDX10]], align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT81:%.*]] = icmp eq i32 [[DOTCOUNTED_BY_LOAD]], 3
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT81]], label %[[HANDLER_OUT_OF_BOUNDS18:.*]], label %[[CONT19:.*]], !prof [[PROF8:![0-9]+]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT81]], label %[[HANDLER_OUT_OF_BOUNDS18:.*]], label %[[CONT19:.*]], !prof [[PROF8:![0-9]+]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS18]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 4) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 4) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT19]]:
// SANITIZE-WITH-ATTR-NEXT: [[ADD:%.*]] = add nsw i32 [[INDEX]], 1
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM31:%.*]] = sext i32 [[ADD]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = icmp ult i64 [[IDXPROM31]], [[TMP0]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP5]], label %[[CONT38:.*]], label %[[HANDLER_OUT_OF_BOUNDS34:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = icmp ult i64 [[IDXPROM31]], [[TMP0]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP5]], label %[[CONT38:.*]], label %[[HANDLER_OUT_OF_BOUNDS34:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS34]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM31]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM31]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT38]]:
// SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[DOTCOUNTED_BY_LOAD]], 3
// SANITIZE-WITH-ATTR-NEXT: [[RESULT25:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 240
// SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = and i32 [[RESULT25]], 252
// SANITIZE-WITH-ATTR-NEXT: [[CONV27:%.*]] = select i1 [[TMP6]], i32 [[TMP7]], i32 0
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[IDXPROM31]]
-// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV27]], ptr [[ARRAYIDX36]], align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV27]], ptr [[ARRAYIDX36]], align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM42:%.*]] = sext i32 [[FAM_IDX]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD44:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD44]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM42]], [[TMP8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS45:.*]], label %[[CONT46:.*]], !prof [[PROF8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD44]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM42]], [[TMP8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS45:.*]], label %[[CONT46:.*]], !prof [[PROF8]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS45]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM42]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM42]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT46]]:
// SANITIZE-WITH-ATTR-NEXT: [[ADD59:%.*]] = add nsw i32 [[INDEX]], 2
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM60:%.*]] = sext i32 [[ADD59]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[TMP9:%.*]] = icmp ult i64 [[IDXPROM60]], [[TMP8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP9]], label %[[CONT67:.*]], label %[[HANDLER_OUT_OF_BOUNDS63:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP9:%.*]] = icmp ult i64 [[IDXPROM60]], [[TMP8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP9]], label %[[CONT67:.*]], label %[[HANDLER_OUT_OF_BOUNDS63:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS63]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB10:[0-9]+]], i64 [[IDXPROM60]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB10:[0-9]+]], i64 [[IDXPROM60]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT67]]:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[IDXPROM60]]
// SANITIZE-WITH-ATTR-NEXT: [[COUNT50:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD44]] to i64
@@ -396,7 +396,7 @@ size_t test3_bdos_cast(struct annotated *p) {
// SANITIZE-WITH-ATTR-NEXT: [[DOTTR:%.*]] = trunc nuw nsw i64 [[TMP11]] to i32
// SANITIZE-WITH-ATTR-NEXT: [[CONV54:%.*]] = shl i32 [[DOTTR]], 2
// SANITIZE-WITH-ATTR-NEXT: [[CONV55:%.*]] = and i32 [[CONV54]], 252
-// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV55]], ptr [[ARRAYIDX65]], align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV55]], ptr [[ARRAYIDX65]], align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: ret void
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4(
@@ -475,12 +475,12 @@ void test4(struct annotated *p, int index, int fam_idx) {
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT1:.*]], !prof [[PROF8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp ugt i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], label %[[CONT1:.*]], !prof [[PROF8]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT1]]:
// SANITIZE-WITH-ATTR-NEXT: [[COUNT:%.*]] = sext i32 [[DOTCOUNTED_BY_LOAD]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = sub nsw i64 [[COUNT]], [[IDXPROM]]
@@ -613,15 +613,15 @@ size_t test4_bdos_cast2(struct annotated *p, int index) {
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOTCOUNTED_BY_GEP]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOTCOUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOTCOUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT3]]:
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDXPROM]]
-// SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: ret void
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test5(
@@ -685,18 +685,18 @@ size_t test5_bdos(struct anon_struct *p) {
// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i64, ptr [[COUNTED_BY_GEP]], align 4
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT6:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT6:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB13:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB13:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT6]]:
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[IDXPROM]]
// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl nuw i64 [[COUNTED_BY_LOAD]], 2
// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], i64 0)
// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = trunc i64 [[TMP2]] to i32
-// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: ret void
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test6(
@@ -773,12 +773,12 @@ size_t test6_bdos(struct anon_struct *p) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i8, ptr [[TMP0]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i8 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT7:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i8 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT7:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB15:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB15:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT7]]:
// SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[INTS]], i64 [[IDXPROM]]
@@ -846,12 +846,12 @@ size_t test7_bdos(struct union_of_fams *p) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i8, ptr [[TMP0]], align 4
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT14:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i8 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT14:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB16:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB16:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT14]]:
// SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[INTS]], i64 [[IDXPROM]]
@@ -927,12 +927,12 @@ size_t test8_bdos(struct union_of_fams *p) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT7:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT7:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB18:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB18:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT7]]:
// SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[BYTES]], i64 [[IDXPROM]]
@@ -1000,12 +1000,12 @@ size_t test9_bdos(struct union_of_fams *p) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT14:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT14:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB19:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB19:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT14]]:
// SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[BYTES]], i64 [[IDXPROM]]
@@ -1087,12 +1087,12 @@ size_t test10_bdos(struct union_of_fams *p) {
// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT6:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT6:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT6]]:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[IDXPROM]]
@@ -1100,7 +1100,7 @@ size_t test10_bdos(struct union_of_fams *p) {
// SANITIZE-WITH-ATTR-NEXT: [[FLEXIBLE_ARRAY_MEMBER_SIZE:%.*]] = shl i32 [[COUNTED_BY_LOAD]], 2
// SANITIZE-WITH-ATTR-NEXT: [[RESULT:%.*]] = add i32 [[FLEXIBLE_ARRAY_MEMBER_SIZE]], 8
// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[TMP2]], i32 [[RESULT]], i32 0
-// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: ret void
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test11(
@@ -1201,23 +1201,23 @@ int test12_a, test12_b;
// SANITIZE-WITH-ATTR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 4 dereferenceable(24) @test12_bar, i64 24, i1 false), !tbaa.struct [[TBAA_STRUCT10:![0-9]+]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[INDEX]], 6
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB22:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB22:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT]]:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[BAZ]], i64 [[TMP1]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]]
-// SANITIZE-WITH-ATTR-NEXT: store i32 [[TMP2]], ptr @test12_b, align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]]
+// SANITIZE-WITH-ATTR-NEXT: store i32 [[TMP2]], ptr @test12_b, align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr @test12_foo, align 4
// SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[DOTCOUNTED_BY_LOAD]], 0
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS4:.*]], label %[[HANDLER_TYPE_MISMATCH6:.*]], !prof [[PROF8]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label %[[HANDLER_OUT_OF_BOUNDS4:.*]], label %[[HANDLER_TYPE_MISMATCH6:.*]], !prof [[PROF8]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS4]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 0) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 0) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_TYPE_MISMATCH6]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4) to i64)) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4) to i64)) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12(
// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] {
@@ -1302,12 +1302,12 @@ struct test13_bar {
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[_ZTS10TEST13_BARPTR_TBAA11:![0-9]+]]
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[INDEX]], [[TMP1]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT5:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[INDEX]], [[TMP1]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT5:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT5]]:
// SANITIZE-WITH-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[REVMAP]], i64 [[INDEX]]
@@ -1364,11 +1364,11 @@ struct test14_foo {
// SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] {
// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp eq i32 [[IDX]], 0
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB29:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB29:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT3]]:
// SANITIZE-WITH-ATTR-NEXT: ret i32 undef
//
@@ -1418,11 +1418,11 @@ int test14(int idx) {
// SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] {
// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp eq i32 [[IDX]], 0
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB31:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB31:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT1]]:
// SANITIZE-WITH-ATTR-NEXT: ret i32 undef
//
@@ -1469,10 +1469,10 @@ int test15(int idx) {
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 680
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 1
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB32:[0-9]+]], i64 2) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB32:[0-9]+]], i64 2) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT1]]:
// SANITIZE-WITH-ATTR-NEXT: ret i64 -1
//
@@ -1607,13 +1607,13 @@ struct tests_foo {
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[VAR]], i64 40
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT4:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT4:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB33:[0-9]+]], i64 10) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB33:[0-9]+]], i64 10) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT4]]:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[VAR]], i64 84
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP1]]
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test24(
@@ -1648,13 +1648,13 @@ int test24(int c, struct tests_foo *var) {
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[_ZTS9TESTS_FOOPTR_TBAA17:![0-9]+]]
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT5:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT5:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 10) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 10) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT5]]:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 44
-// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP2]]
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test25(
@@ -1698,16 +1698,16 @@ struct test26_foo {
// SANITIZE-WITH-ATTR-NEXT: [[S:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 4
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[C]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[S]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT5:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT5:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB35:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB35:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT5]]:
// SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARR]], i64 [[IDXPROM]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP2]]
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test26(
@@ -1769,12 +1769,12 @@ struct test27_foo {
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB37:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB37:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT3]]:
// SANITIZE-WITH-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 24
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[ENTRIES]], i64 [[IDXPROM]]
@@ -1835,16 +1835,16 @@ struct test28_foo {
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP3]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label %[[CONT17:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP3]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label %[[CONT17:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB39:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB39:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT17]]:
// SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARR]], i64 [[IDXPROM]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP5]]
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test28(
@@ -1899,28 +1899,28 @@ struct annotated_struct_array {
// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX1]], 10
// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB41:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB41:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT3]]:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw ptr, ptr [[ANN]], i64 [[TMP1]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[_ZTS9ANNOTATEDPTR_TBAA23:![0-9]+]]
// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM27:%.*]] = sext i32 [[IDX2]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM27]], [[TMP3]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label %[[CONT32:.*]], label %[[HANDLER_OUT_OF_BOUNDS28:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM27]], [[TMP3]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label %[[CONT32:.*]], label %[[HANDLER_OUT_OF_BOUNDS28:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS28]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB42:[0-9]+]], i64 [[IDXPROM27]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB42:[0-9]+]], i64 [[IDXPROM27]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT32]]:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[IDXPROM27]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = tail call i32 @llvm.smax.i32(i32 [[COUNTED_BY_LOAD]], i32 0)
// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = shl i32 [[TMP5]], 2
-// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX30]], align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX30]], align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: ret void
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test29(
@@ -1987,9 +1987,9 @@ struct test30_struct {
// SANITIZE-WITH-ATTR-LABEL: define dso_local void @test30(
// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] {
// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[IDX]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB44:[0-9]+]], i64 [[TMP0]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[IDX]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB44:[0-9]+]], i64 [[TMP0]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test30(
// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -2062,21 +2062,21 @@ struct annotated_with_array {
// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX2]], 43
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX2]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB46:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX2]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB46:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT1]]:
// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336
// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
// SANITIZE-WITH-ATTR-NEXT: [[IDXPROM4:%.*]] = sext i32 [[IDX1]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = icmp ult i64 [[IDXPROM4]], [[TMP2]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP3]], label %[[CONT9:.*]], label %[[HANDLER_OUT_OF_BOUNDS5:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = zext i32 [[COUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = icmp ult i64 [[IDXPROM4]], [[TMP2]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP3]], label %[[CONT9:.*]], label %[[HANDLER_OUT_OF_BOUNDS5:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS5]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB48:[0-9]+]], i64 [[IDXPROM4]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB48:[0-9]+]], i64 [[IDXPROM4]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT9]]:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 344
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i64, ptr [[ARRAY]], i64 [[IDXPROM4]]
@@ -2143,11 +2143,11 @@ void test32(struct annotated_with_array *ptr, int idx1, int idx2) {
// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] {
// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[INDEX]], 43
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB49:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB49:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT1]]:
// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 336
// SANITIZE-WITH-ATTR-NEXT: [[COUNTED_BY_LOAD:%.*]] = load i32, ptr [[COUNTED_BY_GEP]], align 4
@@ -2280,18 +2280,18 @@ struct multi_subscripts {
// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR0]] {
// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]]
// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[IDX1]], 42
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label %[[CONT1:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB51:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB51:[0-9]+]], i64 [[TMP1]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT1]]:
// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i32 [[IDX2]], 43
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS2:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS2:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS2]]:
// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[IDX2]] to i64
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB52:[0-9]+]], i64 [[TMP3]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB52:[0-9]+]], i64 [[TMP3]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT3]]:
// SANITIZE-WITH-ATTR-NEXT: ret i64 -1
//
@@ -2333,16 +2333,16 @@ size_t test34(struct multi_subscripts *ptr, int idx1, int idx2) {
// SANITIZE-WITH-ATTR-NEXT: [[ENTRY:.*:]]
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF3]], !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label %[[CONT3:.*]], label %[[HANDLER_OUT_OF_BOUNDS:.*]], !prof [[PROF7]], !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[HANDLER_OUT_OF_BOUNDS]]:
-// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB53:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META2]]
-// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]]
+// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB53:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META6]]
+// SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META6]]
// SANITIZE-WITH-ATTR: [[CONT3]]:
// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12
// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAY]], i64 [[INDEX]]
-// SANITIZE-WITH-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA4]]
+// SANITIZE-WITH-ATTR-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !tbaa [[INT_TBAA2]]
// SANITIZE-WITH-ATTR-NEXT: ret void
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test35(
@@ -2470,19 +2470,19 @@ size_t test37(struct annotated *ptr) {
return __builtin_dynamic_object_size((1, 2, (4, 5, (7, 8, 9, (10, ptr->array)))), 1);
}
//.
-// SANITIZE-WITH-ATTR: [[META2]] = !{}
-// SANITIZE-WITH-ATTR: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1}
-// SANITIZE-WITH-ATTR: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0}
-// SANITIZE-WITH-ATTR: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0}
-// SANITIZE-WITH-ATTR: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0}
-// SANITIZE-WITH-ATTR: [[META7]] = !{!"Simple C/C++ TBAA"}
+// SANITIZE-WITH-ATTR: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// SANITIZE-WITH-ATTR: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// SANITIZE-WITH-ATTR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// SANITIZE-WITH-ATTR: [[META5]] = !{!"Simple C/C++ TBAA"}
+// SANITIZE-WITH-ATTR: [[META6]] = !{}
+// SANITIZE-WITH-ATTR: [[PROF7]] = !{!"branch_weights", i32 1048575, i32 1}
// SANITIZE-WITH-ATTR: [[PROF8]] = !{!"branch_weights", i32 1, i32 1048575}
-// SANITIZE-WITH-ATTR: [[CHAR_TBAA9]] = !{[[META6]], [[META6]], i64 0}
+// SANITIZE-WITH-ATTR: [[CHAR_TBAA9]] = !{[[META4]], [[META4]], i64 0}
// SANITIZE-WITH-ATTR: [[TBAA_STRUCT10]] = !{i64 0, i64 24, [[CHAR_TBAA9]]}
// SANITIZE-WITH-ATTR: [[_ZTS10TEST13_BARPTR_TBAA11]] = !{[[META12:![0-9]+]], [[META13:![0-9]+]], i64 0}
// SANITIZE-WITH-ATTR: [[META12]] = !{!"test13_foo", [[META13]], i64 0}
// SANITIZE-WITH-ATTR: [[META13]] = !{!"p1 _ZTS10test13_bar", [[META14:![0-9]+]], i64 0}
-// SANITIZE-WITH-ATTR: [[META14]] = !{!"any pointer", [[META6]], i64 0}
+// SANITIZE-WITH-ATTR: [[META14]] = !{!"any pointer", [[META4]], i64 0}
// SANITIZE-WITH-ATTR: [[_ZTS10TEST13_FOOPTR_TBAA15]] = !{[[META16:![0-9]+]], [[META16]], i64 0}
// SANITIZE-WITH-ATTR: [[META16]] = !{!"p1 _ZTS10test13_foo", [[META14]], i64 0}
// SANITIZE-WITH-ATTR: [[_ZTS9TESTS_FOOPTR_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0}
@@ -2494,7 +2494,7 @@ size_t test37(struct annotated *ptr) {
// SANITIZE-WITH-ATTR: [[_ZTS9ANNOTATEDPTR_TBAA23]] = !{[[META24:![0-9]+]], [[META24]], i64 0}
// SANITIZE-WITH-ATTR: [[META24]] = !{!"p1 _ZTS9annotated", [[META14]], i64 0}
// SANITIZE-WITH-ATTR: [[LONG_TBAA25]] = !{[[META26:![0-9]+]], [[META26]], i64 0}
-// SANITIZE-WITH-ATTR: [[META26]] = !{!"long", [[META6]], i64 0}
+// SANITIZE-WITH-ATTR: [[META26]] = !{!"long", [[META4]], i64 0}
//.
// NO-SANITIZE-WITH-ATTR: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
// NO-SANITIZE-WITH-ATTR: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
diff --git a/clang/test/CodeGen/attr-target-mv.c b/clang/test/CodeGen/attr-target-mv.c
index 07f47d93cd29c..b8807dd9171d5 100644
--- a/clang/test/CodeGen/attr-target-mv.c
+++ b/clang/test/CodeGen/attr-target-mv.c
@@ -30,6 +30,8 @@ int __attribute__((target("arch=gracemont"))) foo(void) {return 24;}
int __attribute__((target("arch=pantherlake"))) foo(void) {return 25;}
int __attribute__((target("arch=clearwaterforest"))) foo(void) {return 26;}
int __attribute__((target("arch=diamondrapids"))) foo(void) {return 27;}
+int __attribute__((target("arch=wildcatlake"))) foo(void) {return 28;}
+int __attribute__((target("arch=novalake"))) foo(void) {return 29;}
int __attribute__((target("default"))) foo(void) { return 2; }

int bar(void) {
@@ -203,6 +205,10 @@ void calls_pr50025c(void) { pr50025c(); }
// ITANIUM: ret i32 26
// ITANIUM: define{{.*}} i32 @foo.arch_diamondrapids()
// ITANIUM: ret i32 27
+// ITANIUM: define{{.*}} i32 @foo.arch_wildcatlake()
+// ITANIUM: ret i32 28
+// ITANIUM: define{{.*}} i32 @foo.arch_novalake()
+// ITANIUM: ret i32 29
// ITANIUM: define{{.*}} i32 @foo()
// ITANIUM: ret i32 2
// ITANIUM: define{{.*}} i32 @bar()
@@ -262,6 +268,10 @@ void calls_pr50025c(void) { pr50025c(); }
// WINDOWS: ret i32 26
// WINDOWS: define dso_local i32 @foo.arch_diamondrapids()
// WINDOWS: ret i32 27
+// WINDOWS: define dso_local i32 @foo.arch_wildcatlake()
+// WINDOWS: ret i32 28
+// WINDOWS: define dso_local i32 @foo.arch_novalake()
+// WINDOWS: ret i32 29
// WINDOWS: define dso_local i32 @foo()
// WINDOWS: ret i32 2
// WINDOWS: define dso_local i32 @bar()
diff --git a/clang/test/CodeGen/builtin-masked.c b/clang/test/CodeGen/builtin-masked.c
index e2b5e099a4ba9..28b94b71e8970 100644
--- a/clang/test/CodeGen/builtin-masked.c
+++ b/clang/test/CodeGen/builtin-masked.c
@@ -19,7 +19,7 @@ typedef _Bool v8b __attribute__((ext_vector_type(8)));
// CHECK-NEXT: [[LOAD_BITS2:%.*]] = load i8, ptr [[M_ADDR]], align 1
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[P_ADDR]], align 8
-// CHECK-NEXT: [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
+// CHECK-NEXT: [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP2]], <8 x i1> [[TMP1]], <8 x i32> poison)
// CHECK-NEXT: ret <8 x i32> [[MASKED_LOAD]]
//
v8i test_load(v8b m, int *p) {
@@ -45,7 +45,7 @@ v8i test_load(v8b m, int *p) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[P_ADDR]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr [[T_ADDR]], align 32
-// CHECK-NEXT: [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP2]], <8 x i32> [[TMP4]])
+// CHECK-NEXT: [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP3]], <8 x i1> [[TMP2]], <8 x i32> [[TMP4]])
// CHECK-NEXT: ret <8 x i32> [[MASKED_LOAD]]
//
v8i test_load_passthru(v8b m, int *p, v8i t) {
@@ -97,7 +97,7 @@ v8i test_load_expand(v8b m, int *p, v8i t) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr [[V_ADDR]], align 32
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[P_ADDR]], align 8
-// CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP3]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP2]])
+// CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP3]], ptr align 4 [[TMP4]], <8 x i1> [[TMP2]])
// CHECK-NEXT: ret void
//
void test_store(v8b m, v8i v, int *p) {
@@ -150,7 +150,7 @@ void test_compress_store(v8b m, v8i v, int *p) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr [[IDX_ADDR]], align 32
// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], <8 x i32> [[TMP3]]
-// CHECK-NEXT: [[MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> [[TMP2]], <8 x i32> poison)
+// CHECK-NEXT: [[MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 [[TMP5]], <8 x i1> [[TMP2]], <8 x i32> poison)
// CHECK-NEXT: ret <8 x i32> [[MASKED_GATHER]]
//
v8i test_gather(v8b mask, v8i idx, int *ptr) {
@@ -181,7 +181,7 @@ v8i test_gather(v8b mask, v8i idx, int *ptr) {
// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr [[IDX_ADDR]], align 32
// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], <8 x i32> [[TMP4]]
-// CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP5]], <8 x ptr> [[TMP7]], i32 4, <8 x i1> [[TMP3]])
+// CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP5]], <8 x ptr> align 4 [[TMP7]], <8 x i1> [[TMP3]])
// CHECK-NEXT: ret void
//
void test_scatter(v8b mask, v8i val, v8i idx, int *ptr) {
@@ -203,7 +203,7 @@ void test_scatter(v8b mask, v8i val, v8i idx, int *ptr) {
// CHECK-NEXT: [[LOAD_BITS2:%.*]] = load i8, ptr [[MASK_ADDR]], align 1
// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(42), ptr [[PTR_ADDR]], align 8
-// CHECK-NEXT: [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p42(ptr addrspace(42) [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison)
+// CHECK-NEXT: [[MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p42(ptr addrspace(42) align 4 [[TMP2]], <8 x i1> [[TMP1]], <8 x i32> poison)
// CHECK-NEXT: ret <8 x i32> [[MASKED_LOAD]]
//
v8i test_load_as(v8b mask, int __attribute__((address_space(42))) * ptr) {
@@ -229,7 +229,7 @@ v8i test_load_as(v8b mask, int __attribute__((address_space(42))) * ptr) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[LOAD_BITS2]] to <8 x i1>
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr [[V_ADDR]], align 32
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(42), ptr [[P_ADDR]], align 8
-// CHECK-NEXT: call void @llvm.masked.store.v8i32.p42(<8 x i32> [[TMP3]], ptr addrspace(42) [[TMP4]], i32 4, <8 x i1> [[TMP2]])
+// CHECK-NEXT: call void @llvm.masked.store.v8i32.p42(<8 x i32> [[TMP3]], ptr addrspace(42) align 4 [[TMP4]], <8 x i1> [[TMP2]])
// CHECK-NEXT: ret void
//
void test_store_as(v8b m, v8i v, int __attribute__((address_space(42))) *p) {
@@ -256,7 +256,7 @@ void test_store_as(v8b m, v8i v, int __attribute__((address_space(42))) *p) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr [[IDX_ADDR]], align 32
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(42), ptr [[PTR_ADDR]], align 8
// CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr addrspace(42) [[TMP4]], <8 x i32> [[TMP3]]
-// CHECK-NEXT: [[MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p42(<8 x ptr addrspace(42)> [[TMP5]], i32 4, <8 x i1> [[TMP2]], <8 x i32> poison)
+// CHECK-NEXT: [[MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p42(<8 x ptr addrspace(42)> align 4 [[TMP5]], <8 x i1> [[TMP2]], <8 x i32> poison)
// CHECK-NEXT: ret <8 x i32> [[MASKED_GATHER]]
//
v8i test_gather_as(v8b mask, v8i idx, int __attribute__((address_space(42))) *ptr) {
@@ -287,7 +287,7 @@ v8i test_gather_as(v8b mask, v8i idx, int __attribute__((address_space(42))) *pt
// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr [[IDX_ADDR]], align 32
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(42), ptr [[PTR_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr addrspace(42) [[TMP6]], <8 x i32> [[TMP4]]
-// CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p42(<8 x i32> [[TMP5]], <8 x ptr addrspace(42)> [[TMP7]], i32 4, <8 x i1> [[TMP3]])
+// CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p42(<8 x i32> [[TMP5]], <8 x ptr addrspace(42)> align 4 [[TMP7]], <8 x i1> [[TMP3]])
// CHECK-NEXT: ret void
//
void test_scatter_as(v8b mask, v8i val, v8i idx, int __attribute__((address_space(42))) *ptr) {
diff --git a/clang/test/CodeGen/builtin-maximumnum-minimumnum.c b/clang/test/CodeGen/builtin-maximumnum-minimumnum.c
index ea9d2e7a4ed38..aa18d9ca217f7 100644
--- a/clang/test/CodeGen/builtin-maximumnum-minimumnum.c
+++ b/clang/test/CodeGen/builtin-maximumnum-minimumnum.c
@@ -12,10 +12,10 @@ typedef long double ldouble2 __attribute__((ext_vector_type(2)));
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2:![0-9]+]]
-// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA6:![0-9]+]]
+// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
// CHECK-NEXT: [[ELT_MINIMUMNUM:%.*]] = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]])
// CHECK-NEXT: ret <8 x half> [[ELT_MINIMUMNUM]]
//
half8 pfmin16(half8 a, half8 b) {
@@ -27,10 +27,10 @@ half8 pfmin16(half8 a, half8 b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x bfloat>, align 16
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x bfloat>, align 16
-// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
// CHECK-NEXT: [[ELT_MINIMUMNUM:%.*]] = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]])
// CHECK-NEXT: ret <8 x bfloat> [[ELT_MINIMUMNUM]]
//
bf16x8 pfmin16b(bf16x8 a, bf16x8 b) {
@@ -42,10 +42,10 @@ bf16x8 pfmin16b(bf16x8 a, bf16x8 b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
// CHECK-NEXT: [[ELT_MINIMUMNUM:%.*]] = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
// CHECK-NEXT: ret <4 x float> [[ELT_MINIMUMNUM]]
//
float4 pfmin32(float4 a, float4 b) {
@@ -57,10 +57,10 @@ float4 pfmin32(float4 a, float4 b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
// CHECK-NEXT: [[ELT_MINIMUMNUM:%.*]] = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
// CHECK-NEXT: ret <2 x double> [[ELT_MINIMUMNUM]]
//
double2 pfmin64(double2 a, double2 b) {
@@ -72,12 +72,12 @@ double2 pfmin64(double2 a, double2 b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32
-// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[TBAA6]]
+// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[TBAA6]]
+// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[TBAA6]]
// CHECK-NEXT: [[ELT_MINIMUMNUM:%.*]] = call <2 x x86_fp80> @llvm.minimumnum.v2f80(<2 x x86_fp80> [[TMP2]], <2 x x86_fp80> [[TMP3]])
// CHECK-NEXT: ret <2 x x86_fp80> [[ELT_MINIMUMNUM]]
//
@@ -90,10 +90,10 @@ ldouble2 pfmin80(ldouble2 a, ldouble2 b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
// CHECK-NEXT: [[ELT_MAXIMUMNUM:%.*]] = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]])
// CHECK-NEXT: ret <8 x half> [[ELT_MAXIMUMNUM]]
//
half8 pfmax16(half8 a, half8 b) {
@@ -105,10 +105,10 @@ half8 pfmax16(half8 a, half8 b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x bfloat>, align 16
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x bfloat>, align 16
-// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
// CHECK-NEXT: [[ELT_MAXIMUMNUM:%.*]] = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]])
// CHECK-NEXT: ret <8 x bfloat> [[ELT_MAXIMUMNUM]]
//
bf16x8 pfmax16b(bf16x8 a, bf16x8 b) {
@@ -120,10 +120,10 @@ bf16x8 pfmax16b(bf16x8 a, bf16x8 b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]]
// CHECK-NEXT: [[ELT_MAXIMUMNUM:%.*]] = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
// CHECK-NEXT: ret <4 x float> [[ELT_MAXIMUMNUM]]
//
float4 pfmax32(float4 a, float4 b) {
@@ -135,10 +135,10 @@ float4 pfmax32(float4 a, float4 b) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
-// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]]
-// CHECK-NEXT: store <2 x
double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[TBAA6]] // CHECK-NEXT: [[ELT_MAXIMUMNUM:%.*]] = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) // CHECK-NEXT: ret <2 x double> [[ELT_MAXIMUMNUM]] // @@ -151,12 +151,12 @@ double2 pfmax64(double2 a, double2 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 -// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[TBAA2]] -// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[TBAA6]] +// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[TBAA6]] +// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[TBAA6]] +// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[TBAA6]] // CHECK-NEXT: [[ELT_MINIMUMNUM:%.*]] = call <2 x x86_fp80> @llvm.minimumnum.v2f80(<2 x x86_fp80> [[TMP2]], <2 x x86_fp80> [[TMP3]]) // CHECK-NEXT: ret <2 x x86_fp80> [[ELT_MINIMUMNUM]] // @@ -165,7 +165,7 @@ ldouble2 pfmax80(ldouble2 a, ldouble2 b) { } //. -// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"Simple C++ TBAA"} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK: [[TBAA6]] = !{[[META4]], [[META4]], i64 0} //. 
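Both minimum/maximum-number test updates, above and below, follow the same pattern: the checks never hard-code a metadata number but capture it with a FileCheck regex variable, so a change in emission order only renames the capture (here `TBAA2` becoming `TBAA6`) while the structural assertions are untouched. A minimal, hypothetical illustration of that idiom (the function `f` and its check lines are not part of the patch):

// Hypothetical reduced test: the TBAA node index is captured by pattern,
// not hard-coded, so renumbering the module's metadata does not break it.
//
// CHECK-LABEL: define {{.*}} i32 @f(
// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[INT_TBAA:![0-9]+]]
// CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 4, !tbaa [[INT_TBAA]]
int f(int *p, int *q) {
  *q = *p + 1; // both accesses carry the same "int" access tag
  return *q;
}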
diff --git a/clang/test/CodeGen/builtin-maxnum-minnum.c b/clang/test/CodeGen/builtin-maxnum-minnum.c index 2455f3b616ce7..d05d43c23bf27 100644 --- a/clang/test/CodeGen/builtin-maxnum-minnum.c +++ b/clang/test/CodeGen/builtin-maxnum-minnum.c @@ -12,10 +12,10 @@ typedef long double ldouble2 __attribute__((ext_vector_type(2))); // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] -// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6:![0-9]+]] +// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[ELT_MINNUM:%.*]] = call <8 x half> @llvm.minnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]]) // CHECK-NEXT: ret <8 x half> [[ELT_MINNUM]] // @@ -27,10 +27,10 @@ half8 pfmin16(half8 a, half8 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x bfloat>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x bfloat>, align 16 -// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[ELT_MINNUM:%.*]] = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]]) // CHECK-NEXT: ret <8 x bfloat> [[ELT_MINNUM]] // @@ -42,10 +42,10 @@ bf16x8 pfmin16b(bf16x8 a, bf16x8 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 -// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[ELT_MINNUM:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP0]], <4 x float> 
[[TMP1]]) // CHECK-NEXT: ret <4 x float> [[ELT_MINNUM]] // @@ -57,10 +57,10 @@ float4 pfmin32(float4 a, float4 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 -// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[ELT_MINNUM:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) // CHECK-NEXT: ret <2 x double> [[ELT_MINNUM]] // @@ -72,12 +72,12 @@ double2 pfmin64(double2 a, double2 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 -// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[ELT_MINNUM:%.*]] = call <2 x x86_fp80> @llvm.minnum.v2f80(<2 x x86_fp80> [[TMP2]], <2 x x86_fp80> [[TMP3]]) // CHECK-NEXT: ret <2 x x86_fp80> [[ELT_MINNUM]] // @@ -90,10 +90,10 @@ ldouble2 pfmin80(ldouble2 a, ldouble2 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16, !tbaa 
[[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[ELT_MAXNUM:%.*]] = call <8 x half> @llvm.maxnum.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]]) // CHECK-NEXT: ret <8 x half> [[ELT_MAXNUM]] // @@ -105,10 +105,10 @@ half8 pfmax16(half8 a, half8 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x bfloat>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x bfloat>, align 16 -// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <8 x bfloat> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <8 x bfloat> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[ELT_MAXNUM:%.*]] = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]]) // CHECK-NEXT: ret <8 x bfloat> [[ELT_MAXNUM]] // @@ -120,10 +120,10 @@ bf16x8 pfmax16b(bf16x8 a, bf16x8 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16 -// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[ELT_MAXNUM:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) // CHECK-NEXT: ret <4 x float> [[ELT_MAXNUM]] // @@ -135,10 +135,10 @@ float4 pfmax32(float4 a, float4 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16 -// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[ELT_MAXNUM:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> 
[[TMP0]], <2 x double> [[TMP1]]) // CHECK-NEXT: ret <2 x double> [[ELT_MAXNUM]] // @@ -151,12 +151,12 @@ double2 pfmax64(double2 a, double2 b) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x x86_fp80>, align 32 -// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] -// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: [[A:%.*]] = load <2 x x86_fp80>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[B:%.*]] = load <2 x x86_fp80>, ptr [[TMP1]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <2 x x86_fp80> [[A]], ptr [[A_ADDR]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: store <2 x x86_fp80> [[B]], ptr [[B_ADDR]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x x86_fp80>, ptr [[A_ADDR]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x x86_fp80>, ptr [[B_ADDR]], align 32, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: [[ELT_MINNUM:%.*]] = call <2 x x86_fp80> @llvm.minnum.v2f80(<2 x x86_fp80> [[TMP2]], <2 x x86_fp80> [[TMP3]]) // CHECK-NEXT: ret <2 x x86_fp80> [[ELT_MINNUM]] // @@ -165,7 +165,7 @@ ldouble2 pfmax80(ldouble2 a, ldouble2 b) { } //. -// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"Simple C++ TBAA"} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0} //. 
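For readers unfamiliar with the two intrinsic families these regenerated tests cover: llvm.minnum/llvm.maxnum implement IEEE-754 minNum/maxNum (a quiet NaN loses to a number), while llvm.minimumnum/llvm.maximumnum follow IEEE 754-2019 minimumNumber/maximumNumber, which additionally order -0.0 below +0.0. A small scalar sketch, assuming Clang's documented mapping of __builtin_elementwise_min on floating-point operands to llvm.minnum and of __builtin_elementwise_minimumnum to llvm.minimumnum (this driver is illustrative, not part of the patch):

#include <math.h>
#include <stdio.h>

int main(void) {
  // NaN loses to a number for both families.
  printf("%f\n", __builtin_elementwise_min((double)NAN, 1.0));   // 1.000000
  // minimumnum additionally treats -0.0 as smaller than +0.0.
  printf("%f\n", __builtin_elementwise_minimumnum(-0.0, 0.0));   // -0.000000
  return 0;
}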
diff --git a/clang/test/CodeGen/c11atomics-ios.c b/clang/test/CodeGen/c11atomics-ios.c
index 8310270ed8547..34843ef55ecb7 100644
--- a/clang/test/CodeGen/c11atomics-ios.c
+++ b/clang/test/CodeGen/c11atomics-ios.c
@@ -235,6 +235,7 @@ _Bool test_promoted_cmpxchg(_Atomic(PS) *addr, PS *desired, PS *new) {
   // CHECK: [[ATOMIC_DESIRED:%.*]] = alloca { %struct.PS, [2 x i8] }, align 8
   // CHECK: [[ATOMIC_NEW:%.*]] = alloca { %struct.PS, [2 x i8] }, align 8
   // CHECK: [[RES_ADDR:%.*]] = alloca i8, align 1
+  // CHECK: [[OLD_TMP:%.*]] = alloca i64, align 8
   // CHECK: store ptr %addr, ptr [[ADDR_ARG]], align 4
   // CHECK: store ptr %desired, ptr [[DESIRED_ARG]], align 4
   // CHECK: store ptr %new, ptr [[NEW_ARG]], align 4
@@ -251,7 +252,8 @@ _Bool test_promoted_cmpxchg(_Atomic(PS) *addr, PS *desired, PS *new) {
   // CHECK: [[RES_BOOL:%.*]] = extractvalue { i64, i1 } [[RES]], 1
   // CHECK: br i1 [[RES_BOOL]], label {{%.*}}, label {{%.*}}
-  // CHECK: store i64 [[RES_VAL64]], ptr [[ATOMIC_DESIRED]], align 8
+  // CHECK: store i64 [[RES_VAL64]], ptr [[OLD_TMP]], align 8
+  // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 2 [[DESIRED_ARG:%.*]], ptr align 8 [[OLD_TMP]], i64 6, i1 false)
   // CHECK: br label {{%.*}}
   // CHECK: [[RES_BOOL8:%.*]] = zext i1 [[RES_BOOL]] to i8
diff --git a/clang/test/CodeGen/call-graph-section-callback.cpp b/clang/test/CodeGen/call-graph-section-callback.cpp
new file mode 100644
index 0000000000000..e9b0a1818e3a4
--- /dev/null
+++ b/clang/test/CodeGen/call-graph-section-callback.cpp
@@ -0,0 +1,30 @@
+// Tests that a callback function whose address is taken has type ID metadata
+// attached, as it is a potential indirect call target.
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fexperimental-call-graph-section \
+// RUN: -emit-llvm -o %t %s
+// RUN: FileCheck %s < %t
+
+////////////////////////////////////////////////////////////////////////////////
+typedef void (*CallbackFn)(int);
+
+// Callback function with "internal" linkage.
+// CHECK-LABEL: define internal void @_ZL10myCallbacki(
+// CHECK-SAME: {{.*}} !type [[F_CALLBACK:![0-9]+]]
+static void myCallback(int value)
+{
+  volatile int sink = value;
+  (void)sink;
+}
+
+int takeCallbackAddress() {
+  // Take the address of the callback explicitly (address-taken function)
+  CallbackFn cb = &myCallback;
+  // Store the address in a volatile pointer to keep it observable
+  volatile void* addr = (void*)cb;
+  (void)addr;
+
+  return 0;
+}
+
+// CHECK: [[F_CALLBACK]] = !{i64 0, !"_ZTSFviE.generalized"}
diff --git a/clang/test/CodeGen/call-graph-section-templates.cpp b/clang/test/CodeGen/call-graph-section-templates.cpp
new file mode 100644
index 0000000000000..39030d27a4ea9
--- /dev/null
+++ b/clang/test/CodeGen/call-graph-section-templates.cpp
@@ -0,0 +1,117 @@
+// Tests that we assign appropriate identifiers to indirect calls and targets
+// specifically for C++ templates.
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fexperimental-call-graph-section \
+// RUN: -emit-llvm -o %t %s
+// RUN: FileCheck --check-prefix=FT %s < %t
+// RUN: FileCheck --check-prefix=CST %s < %t
+
+////////////////////////////////////////////////////////////////////////////////
+// Class definitions and template classes (check for indirect target metadata)
+
+class Cls1 {};
+
+// Cls2 is instantiated with T=Cls1 in foo(). The following checks are for this
+// instantiation.
+template <typename T>
+class Cls2 {
+public:
+  // FT-LABEL: define {{.*}} void @_ZN4Cls2I4Cls1E2f1Ev(
+  // FT-SAME: {{.*}} !type [[F_TCLS2F1:![0-9]+]]
+  void f1() {}
+
+  // FT-LABEL: define {{.*}} void @_ZN4Cls2I4Cls1E2f2ES0_(
+  // FT-SAME: {{.*}} !type [[F_TCLS2F2:![0-9]+]]
+  void f2(T a) {}
+
+  // FT-LABEL: define {{.*}} void @_ZN4Cls2I4Cls1E2f3EPS0_(
+  // FT-SAME: {{.*}} !type [[F_TCLS2F3:![0-9]+]]
+  void f3(T *a) {}
+
+  // FT-LABEL: define {{.*}} void @_ZN4Cls2I4Cls1E2f4EPKS0_(
+  // FT-SAME: {{.*}} !type [[F_TCLS2F4:![0-9]+]]
+  void f4(const T *a) {}
+
+  // FT-LABEL: define {{.*}} void @_ZN4Cls2I4Cls1E2f5ERS0_(
+  // FT-SAME: {{.*}} !type [[F_TCLS2F5:![0-9]+]]
+  void f5(T &a) {}
+
+  // FT-LABEL: define {{.*}} void @_ZN4Cls2I4Cls1E2f6ERKS0_(
+  // FT-SAME: {{.*}} !type [[F_TCLS2F6:![0-9]+]]
+  void f6(const T &a) {}
+
+  // Mixed type function pointer member
+  T *(*fp)(T a, T *b, const T *c, T &d, const T &e);
+};
+
+// FT: [[F_TCLS2F1]] = !{i64 0, !"_ZTSFvvE.generalized"}
+// FT: [[F_TCLS2F2]] = !{i64 0, !"_ZTSFv4Cls1E.generalized"}
+// FT: [[F_TCLS2F3]] = !{i64 0, !"_ZTSFvP4Cls1E.generalized"}
+// FT: [[F_TCLS2F4]] = !{i64 0, !"_ZTSFvPK4Cls1E.generalized"}
+// FT: [[F_TCLS2F5]] = !{i64 0, !"_ZTSFvR4Cls1E.generalized"}
+// FT: [[F_TCLS2F6]] = !{i64 0, !"_ZTSFvRK4Cls1E.generalized"}
+
+////////////////////////////////////////////////////////////////////////////////
+// Callsites (check for indirect callsite operand bundles)
+
+template <typename T>
+T *T_func(T a, T *b, const T *c, T &d, const T &e) { return b; }
+
+// CST-LABEL: define {{.*}} @_Z3foov
+// CST-SAME: {{.*}} !type [[F_TCLS2F1:![0-9]+]]
+void foo() {
+  // Methods for Cls2 are checked above within the template description.
+  Cls2<Cls1> Obj;
+
+  Obj.fp = T_func<Cls1>;
+  Cls1 Cls1Obj;
+
+  // CST: call noundef ptr %{{.*}}, !callee_type [[F_TFUNC_CLS1_CT:![0-9]+]]
+  Obj.fp(Cls1Obj, &Cls1Obj, &Cls1Obj, Cls1Obj, Cls1Obj);
+
+  // Make indirect calls to Cls2's member methods
+  auto fp_f1 = &Cls2<Cls1>::f1;
+  auto fp_f2 = &Cls2<Cls1>::f2;
+  auto fp_f3 = &Cls2<Cls1>::f3;
+  auto fp_f4 = &Cls2<Cls1>::f4;
+  auto fp_f5 = &Cls2<Cls1>::f5;
+  auto fp_f6 = &Cls2<Cls1>::f6;
+
+  auto *Obj2Ptr = &Obj;
+
+  // CST: call void %{{.*}}, !callee_type [[F_TCLS2F1_CT:![0-9]+]]
+  (Obj2Ptr->*fp_f1)();
+
+  // CST: call void %{{.*}}, !callee_type [[F_TCLS2F2_CT:![0-9]+]]
+  (Obj2Ptr->*fp_f2)(Cls1Obj);
+
+  // CST: call void %{{.*}}, !callee_type [[F_TCLS2F3_CT:![0-9]+]]
+  (Obj2Ptr->*fp_f3)(&Cls1Obj);
+
+  // CST: call void %{{.*}}, !callee_type [[F_TCLS2F4_CT:![0-9]+]]
+  (Obj2Ptr->*fp_f4)(&Cls1Obj);
+
+  // CST: call void %{{.*}}, !callee_type [[F_TCLS2F5_CT:![0-9]+]]
+  (Obj2Ptr->*fp_f5)(Cls1Obj);
+
+  // CST: call void %{{.*}}, !callee_type [[F_TCLS2F6_CT:![0-9]+]]
+  (Obj2Ptr->*fp_f6)(Cls1Obj);
+}
+
+// CST-LABEL: define {{.*}} @_Z6T_funcI4Cls1EPT_S1_S2_PKS1_RS1_RS3_(
+// CST-SAME: {{.*}} !type [[F_TFUNC_CLS1:![0-9]+]]
+
+// CST: [[F_TCLS2F1]] = !{i64 0, !"_ZTSFvvE.generalized"}
+// CST: [[F_TFUNC_CLS1_CT]] = !{[[F_TFUNC_CLS1:![0-9]+]]}
+// CST: [[F_TFUNC_CLS1]] = !{i64 0, !"_ZTSFP4Cls1S_S0_PKS_RS_RS1_E.generalized"}
+// CST: [[F_TCLS2F1_CT]] = !{[[F_TCLS2F1:![0-9]+]]}
+// CST: [[F_TCLS2F2_CT]] = !{[[F_TCLS2F2:![0-9]+]]}
+// CST: [[F_TCLS2F2]] = !{i64 0, !"_ZTSFv4Cls1E.generalized"}
+// CST: [[F_TCLS2F3_CT]] = !{[[F_TCLS2F3:![0-9]+]]}
+// CST: [[F_TCLS2F3]] = !{i64 0, !"_ZTSFvP4Cls1E.generalized"}
+// CST: [[F_TCLS2F4_CT]] = !{[[F_TCLS2F4:![0-9]+]]}
+// CST: [[F_TCLS2F4]] = !{i64 0, !"_ZTSFvPK4Cls1E.generalized"}
+// CST: [[F_TCLS2F5_CT]] = !{[[F_TCLS2F5:![0-9]+]]}
+// CST: [[F_TCLS2F5]] = !{i64 0, !"_ZTSFvR4Cls1E.generalized"}
+// CST:
[[F_TCLS2F6_CT]] = !{[[F_TCLS2F6:![0-9]+]]} +// CST: [[F_TCLS2F6]] = !{i64 0, !"_ZTSFvRK4Cls1E.generalized"} diff --git a/clang/test/CodeGen/call-graph-section-virtual-methods.cpp b/clang/test/CodeGen/call-graph-section-virtual-methods.cpp new file mode 100644 index 0000000000000..afeeae146ec41 --- /dev/null +++ b/clang/test/CodeGen/call-graph-section-virtual-methods.cpp @@ -0,0 +1,56 @@ +// Tests that we assign appropriate identifiers to indirect calls and targets +// specifically for virtual methods. + +// RUN: %clang_cc1 -triple x86_64-unknown-linux -fexperimental-call-graph-section \ +// RUN: -emit-llvm -o %t %s +// RUN: FileCheck --check-prefix=FT %s < %t +// RUN: FileCheck --check-prefix=CST %s < %t + +//////////////////////////////////////////////////////////////////////////////// +// Class definitions (check for indirect target metadata) + +class Base { + public: + // FT-LABEL: define {{.*}} @_ZN4Base2vfEPc( + // FT-SAME: {{.*}} !type [[F_TVF:![0-9]+]] + virtual int vf(char *a) { return 0; }; + }; + + class Derived : public Base { + public: + // FT: define {{.*}} @_ZN7Derived2vfEPc({{.*}} !type [[F_TVF]] + int vf(char *a) override { return 1; }; + }; + + // FT: [[F_TVF]] = !{i64 0, !"_ZTSFiPcE.generalized"} + + //////////////////////////////////////////////////////////////////////////////// + // Callsites (check for indirect callsite operand bundles) + + // CST-LABEL: define {{.*}} @_Z3foov + void foo() { + auto B = Base(); + auto D = Derived(); + + Base *Bptr = &B; + Base *BptrToD = &D; + Derived *Dptr = &D; + + auto FpBaseVf = &Base::vf; + auto FpDerivedVf = &Derived::vf; + + // CST: call noundef i32 %{{.*}}, !callee_type [[F_TVF_CT:![0-9]+]] + (Bptr->*FpBaseVf)(0); + + // CST: call noundef i32 %{{.*}}, !callee_type [[F_TVF_CT:![0-9]+]] + (BptrToD->*FpBaseVf)(0); + + // CST: call noundef i32 %{{.*}}, !callee_type [[F_TVF_CT:![0-9]+]] + (Dptr->*FpBaseVf)(0); + + // CST: call noundef i32 %{{.*}}, !callee_type [[F_TVF_CT:![0-9]+]] + (Dptr->*FpDerivedVf)(0); + } + + // CST: [[F_TVF_CT]] = !{[[F_TVF:![0-9]+]]} + // CST: [[F_TVF]] = !{i64 0, !"_ZTSFiPcE.generalized"} diff --git a/clang/test/CodeGen/call-graph-section.c b/clang/test/CodeGen/call-graph-section.c new file mode 100644 index 0000000000000..69cdd59549190 --- /dev/null +++ b/clang/test/CodeGen/call-graph-section.c @@ -0,0 +1,93 @@ +// Tests that we assign appropriate identifiers to indirect calls and targets. 
+ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -fexperimental-call-graph-section \ +// RUN: -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,ITANIUM %s + +// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -fexperimental-call-graph-section \ +// RUN: -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,MS %s + +// CHECK-LABEL: define {{(dso_local)?}} void @foo( +// CHECK-SAME: {{.*}} !type [[F_TVOID:![0-9]+]] +void foo() { +} + +// CHECK-LABEL: define {{(dso_local)?}} void @bar( +// CHECK-SAME: {{.*}} !type [[F_TVOID]] +void bar() { + void (*fp)() = foo; + // ITANIUM: call {{.*}}, !callee_type [[F_TVOID_CT:![0-9]+]] + // MS: call {{.*}}, !callee_type [[F_TVOID_CT:![0-9]+]] + fp(); +} + +// CHECK-LABEL: define {{(dso_local)?}} i32 @baz( +// CHECK-SAME: {{.*}} !type [[F_TPRIMITIVE:![0-9]+]] +int baz(char a, float b, double c) { + return 1; +} + +// CHECK-LABEL: define {{(dso_local)?}} ptr @qux( +// CHECK-SAME: {{.*}} !type [[F_TPTR:![0-9]+]] +int *qux(char *a, float *b, double *c) { + return 0; +} + +// CHECK-LABEL: define {{(dso_local)?}} void @corge( +// CHECK-SAME: {{.*}} !type [[F_TVOID]] +void corge() { + int (*fp_baz)(char, float, double) = baz; + // CHECK: call i32 {{.*}}, !callee_type [[F_TPRIMITIVE_CT:![0-9]+]] + fp_baz('a', .0f, .0); + + int *(*fp_qux)(char *, float *, double *) = qux; + // CHECK: call ptr {{.*}}, !callee_type [[F_TPTR_CT:![0-9]+]] + fp_qux(0, 0, 0); +} + +struct st1 { + int *(*fp)(char *, float *, double *); +}; + +struct st2 { + struct st1 m; +}; + +// CHECK-LABEL: define {{(dso_local)?}} void @stparam( +// CHECK-SAME: {{.*}} !type [[F_TSTRUCT:![0-9]+]] +void stparam(struct st2 a, struct st2 *b) {} + +// CHECK-LABEL: define {{(dso_local)?}} void @stf( +// CHECK-SAME: {{.*}} !type [[F_TVOID]] +void stf() { + struct st1 St1; + St1.fp = qux; + // CHECK: call ptr {{.*}}, !callee_type [[F_TPTR_CT:![0-9]+]] + St1.fp(0, 0, 0); + + struct st2 St2; + St2.m.fp = qux; + // CHECK: call ptr {{.*}}, !callee_type [[F_TPTR_CT:![0-9]+]] + St2.m.fp(0, 0, 0); + + // CHECK: call void {{.*}}, !callee_type [[F_TSTRUCT_CT:![0-9]+]] + void (*fp_stparam)(struct st2, struct st2 *) = stparam; + fp_stparam(St2, &St2); +} + +// ITANIUM: [[F_TVOID]] = !{i64 0, !"_ZTSFvE.generalized"} +// ITANIUM: [[F_TVOID_CT]] = !{[[F_TVOID:![0-9]+]]} +// ITANIUM: [[F_TPRIMITIVE]] = !{i64 0, !"_ZTSFicfdE.generalized"} +// ITANIUM: [[F_TPTR]] = !{i64 0, !"_ZTSFPiPcPfPdE.generalized"} +// ITANIUM: [[F_TPRIMITIVE_CT]] = !{[[F_TPRIMITIVE:![0-9]+]]} +// ITANIUM: [[F_TPTR_CT]] = !{[[F_TPTR:![0-9]+]]} +// ITANIUM: [[F_TSTRUCT]] = !{i64 0, !"_ZTSFv3st2PS_E.generalized"} +// ITANIUM: [[F_TSTRUCT_CT]] = !{[[F_TSTRUCT:![0-9]+]]} + +// MS: [[F_TVOID]] = !{i64 0, !"?6AX@Z.generalized"} +// MS: [[F_TVOID_CT]] = !{[[F_TVOID:![0-9]+]]} +// MS: [[F_TPRIMITIVE]] = !{i64 0, !"?6AHDMN@Z.generalized"} +// MS: [[F_TPTR]] = !{i64 0, !"?6APEAHPEADPEAMPEAN@Z.generalized"} +// MS: [[F_TPRIMITIVE_CT]] = !{[[F_TPRIMITIVE:![0-9]+]]} +// MS: [[F_TPTR_CT]] = !{[[F_TPTR:![0-9]+]]} +// MS: [[F_TSTRUCT]] = !{i64 0, !"?6AXUst2@@PEAU0@@Z.generalized"} +// MS: [[F_TSTRUCT_CT]] = !{[[F_TSTRUCT:![0-9]+]]} diff --git a/clang/test/CodeGen/call-graph-section.cpp b/clang/test/CodeGen/call-graph-section.cpp new file mode 100644 index 0000000000000..86ed3ee2337a7 --- /dev/null +++ b/clang/test/CodeGen/call-graph-section.cpp @@ -0,0 +1,147 @@ +// Tests that we assign appropriate identifiers to indirect calls and targets +// specifically for C++ class and instance methods. 
+ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -fexperimental-call-graph-section \ +// RUN: -emit-llvm -o %t %s +// RUN: FileCheck --check-prefix=FT %s < %t +// RUN: FileCheck --check-prefix=CST %s < %t + +//////////////////////////////////////////////////////////////////////////////// +// Class definitions (check for indirect target metadata) + +class Cls1 { +public: + // FT-LABEL: define {{.*}} ptr @_ZN4Cls18receiverEPcPf( + // FT-SAME: {{.*}} !type [[F_TCLS1RECEIVER:![0-9]+]] + static int *receiver(char *a, float *b) { return 0; } +}; + +class Cls2 { +public: + int *(*fp)(char *, float *); + + // FT-LABEL: define {{.*}} i32 @_ZN4Cls22f1Ecfd( + // FT-SAME: {{.*}} !type [[F_TCLS2F1:![0-9]+]] + int f1(char a, float b, double c) { return 0; } + + // FT-LABEL: define {{.*}} ptr @_ZN4Cls22f2EPcPfPd( + // FT-SAME: {{.*}} !type [[F_TCLS2F2:![0-9]+]] + int *f2(char *a, float *b, double *c) { return 0; } + + // FT-LABEL: define {{.*}} void @_ZN4Cls22f3E4Cls1( + // FT-SAME: {{.*}} !type [[F_TCLS2F3F4:![0-9]+]] + void f3(Cls1 a) {} + + // FT-LABEL: define {{.*}} void @_ZN4Cls22f4E4Cls1( + // FT-SAME: {{.*}} !type [[F_TCLS2F3F4]] + void f4(const Cls1 a) {} + + // FT-LABEL: define {{.*}} void @_ZN4Cls22f5EP4Cls1( + // FT-SAME: {{.*}} !type [[F_TCLS2F5:![0-9]+]] + void f5(Cls1 *a) {} + + // FT-LABEL: define {{.*}} void @_ZN4Cls22f6EPK4Cls1( + // FT-SAME: {{.*}} !type [[F_TCLS2F6:![0-9]+]] + void f6(const Cls1 *a) {} + + // FT-LABEL: define {{.*}} void @_ZN4Cls22f7ER4Cls1( + // FT-SAME: {{.*}} !type [[F_TCLS2F7:![0-9]+]] + void f7(Cls1 &a) {} + + // FT-LABEL: define {{.*}} void @_ZN4Cls22f8ERK4Cls1( + // FT-SAME: {{.*}} !type [[F_TCLS2F8:![0-9]+]] + void f8(const Cls1 &a) {} + + // FT-LABEL: define {{.*}} void @_ZNK4Cls22f9Ev( + // FT-SAME: {{.*}} !type [[F_TCLS2F9:![0-9]+]] + void f9() const {} +}; + +// FT: [[F_TCLS1RECEIVER]] = !{i64 0, !"_ZTSFPiPcPfE.generalized"} +// FT: [[F_TCLS2F1]] = !{i64 0, !"_ZTSFicfdE.generalized"} +// FT: [[F_TCLS2F2]] = !{i64 0, !"_ZTSFPiPcPfPdE.generalized"} +// FT: [[F_TCLS2F3F4]] = !{i64 0, !"_ZTSFv4Cls1E.generalized"} +// FT: [[F_TCLS2F5]] = !{i64 0, !"_ZTSFvP4Cls1E.generalized"} +// FT: [[F_TCLS2F6]] = !{i64 0, !"_ZTSFvPK4Cls1E.generalized"} +// FT: [[F_TCLS2F7]] = !{i64 0, !"_ZTSFvR4Cls1E.generalized"} +// FT: [[F_TCLS2F8]] = !{i64 0, !"_ZTSFvRK4Cls1E.generalized"} +// FT: [[F_TCLS2F9]] = !{i64 0, !"_ZTSKFvvE.generalized"} + +//////////////////////////////////////////////////////////////////////////////// +// Callsites (check for indirect callsites' callee_type metadata ) + +// CST-LABEL: define {{.*}} @_Z3foov +void foo() { + Cls2 ObjCls2; + ObjCls2.fp = &Cls1::receiver; + + // CST: call noundef ptr %{{.*}}, !callee_type [[F_TCLS1RECEIVER_CT:![0-9]+]] + ObjCls2.fp(0, 0); + + auto fp_f1 = &Cls2::f1; + auto fp_f2 = &Cls2::f2; + auto fp_f3 = &Cls2::f3; + auto fp_f4 = &Cls2::f4; + auto fp_f5 = &Cls2::f5; + auto fp_f6 = &Cls2::f6; + auto fp_f7 = &Cls2::f7; + auto fp_f8 = &Cls2::f8; + auto fp_f9 = &Cls2::f9; + + Cls2 *ObjCls2Ptr = &ObjCls2; + Cls1 Cls1Param; + + // CST: call noundef i32 %{{.*}}, !callee_type [[F_TCLS2F1_CT:![0-9]+]] + (ObjCls2Ptr->*fp_f1)(0, 0, 0); + + // CST: call noundef ptr %{{.*}}, !callee_type [[F_TCLS2F2_CT:![0-9]+]] + (ObjCls2Ptr->*fp_f2)(0, 0, 0); + + // CST: call void %{{.*}}, !callee_type [[F_TCLS2F3F4_CT:![0-9]+]] + (ObjCls2Ptr->*fp_f3)(Cls1Param); + + // CST: call void %{{.*}}, !callee_type [[F_TCLS2F3F4_CT:![0-9]+]] + (ObjCls2Ptr->*fp_f4)(Cls1Param); + + // CST: call void %{{.*}}, !callee_type [[F_TCLS2F5_CT:![0-9]+]] + 
(ObjCls2Ptr->*fp_f5)(&Cls1Param); + + // CST: call void %{{.*}}, !callee_type [[F_TCLS2F6_CT:![0-9]+]] + (ObjCls2Ptr->*fp_f6)(&Cls1Param); + + // CST: call void %{{.*}}, !callee_type [[F_TCLS2F7_CT:![0-9]+]] + (ObjCls2Ptr->*fp_f7)(Cls1Param); + + // CST: call void %{{.*}}, !callee_type [[F_TCLS2F8_CT:![0-9]+]] + (ObjCls2Ptr->*fp_f8)(Cls1Param); + + // CST: call void %{{.*}}, !callee_type [[F_TCLS2F9_CT:![0-9]+]] + (ObjCls2Ptr->*fp_f9)(); +} + +// CST: [[F_TCLS1RECEIVER_CT]] = !{[[F_TCLS1RECEIVER:![0-9]+]]} +// CST: [[F_TCLS1RECEIVER]] = !{i64 0, !"_ZTSFPiPcPfE.generalized"} + +// CST: [[F_TCLS2F1_CT]] = !{[[F_TCLS2F1:![0-9]+]]} +// CST: [[F_TCLS2F1]] = !{i64 0, !"_ZTSFicfdE.generalized"} + +// CST: [[F_TCLS2F2_CT]] = !{[[F_TCLS2F2:![0-9]+]]} +// CST: [[F_TCLS2F2]] = !{i64 0, !"_ZTSFPiPcPfPdE.generalized"} + +// CST: [[F_TCLS2F3F4_CT]] = !{[[F_TCLS2F3F4:![0-9]+]]} +// CST: [[F_TCLS2F3F4]] = !{i64 0, !"_ZTSFv4Cls1E.generalized"} + +// CST: [[F_TCLS2F5_CT]] = !{[[F_TCLS2F5:![0-9]+]]} +// CST: [[F_TCLS2F5]] = !{i64 0, !"_ZTSFvP4Cls1E.generalized"} + +// CST: [[F_TCLS2F6_CT]] = !{[[F_TCLS2F6:![0-9]+]]} +// CST: [[F_TCLS2F6]] = !{i64 0, !"_ZTSFvPK4Cls1E.generalized"} + +// CST: [[F_TCLS2F7_CT]] = !{[[F_TCLS2F7:![0-9]+]]} +// CST: [[F_TCLS2F7]] = !{i64 0, !"_ZTSFvR4Cls1E.generalized"} + +// CST: [[F_TCLS2F8_CT]] = !{[[F_TCLS2F8:![0-9]+]]} +// CST: [[F_TCLS2F8]] = !{i64 0, !"_ZTSFvRK4Cls1E.generalized"} + +// CST: [[F_TCLS2F9_CT]] = !{[[F_TCLS2F9:![0-9]+]]} +// CST: [[F_TCLS2F9]] = !{i64 0, !"_ZTSKFvvE.generalized"} diff --git a/clang/test/CodeGen/distributed-thin-lto/supports-hot-cold-new.ll b/clang/test/CodeGen/distributed-thin-lto/supports-hot-cold-new.ll index 08c1a2946971c..90cda3ece773e 100644 --- a/clang/test/CodeGen/distributed-thin-lto/supports-hot-cold-new.ll +++ b/clang/test/CodeGen/distributed-thin-lto/supports-hot-cold-new.ll @@ -22,7 +22,7 @@ ; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t1.o -x ir %t.o -c -fthinlto-index=%t.o.thinlto.bc -save-temps=obj -; RUN: llvm-dis %t.s.3.import.bc -o - | FileCheck %s --check-prefix=CHECK-IR +; RUN: llvm-dis %t.s.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR ; CHECK-IR: !memprof {{.*}} !callsite ; CHECK-IR: "memprof"="cold" @@ -42,10 +42,15 @@ ; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t1.o -x ir %t.o -c -fthinlto-index=%t.o.thinlto.bc -save-temps=obj -; RUN: llvm-dis %t.s.3.import.bc -o - | FileCheck %s \ +; RUN: llvm-dis %t.s.4.opt.bc -o - | FileCheck %s \ ; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \ ; RUN: --implicit-check-not "memprof"="cold" +;; Ensure the attributes and metadata are stripped when running a non-LTO pipeline. 
+; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -x ir %t.o -S -emit-llvm -o - | FileCheck %s \ +; RUN: --implicit-check-not "!memprof" --implicit-check-not "!callsite" \ +; RUN: --implicit-check-not "memprof"="cold" + source_filename = "thinlto-distributed-supports-hot-cold-new.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/clang/test/CodeGen/errno-tbaa.c b/clang/test/CodeGen/errno-tbaa.c new file mode 100644 index 0000000000000..4ca62a37babf2 --- /dev/null +++ b/clang/test/CodeGen/errno-tbaa.c @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -O0 -o - %s | FileCheck %s --check-prefix=NOTBAA +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -O1 -o - %s | FileCheck %s --check-prefix=ERRNO-TBAA +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -x c++ -emit-llvm -O1 -o - %s | FileCheck %s --check-prefix=ERRNO-TBAA +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -O1 -relaxed-aliasing -o - %s | FileCheck %s --check-prefix=NOSTRICT + +// Ensure !llvm.errno.tbaa metadata is emitted upon integer accesses, if TBAA is available. + +int int_access(int *ptr) { return ptr ? *ptr : 0; } + +// NOTBAA-NOT: !llvm.errno.tbaa +// ERRNO-TBAA: !llvm.errno.tbaa +// NOSTRICT-NOT: !llvm.errno.tbaa diff --git a/clang/test/CodeGen/ext-int-cc.c b/clang/test/CodeGen/ext-int-cc.c index 7cfd992fd48b4..f845afcf1e087 100644 --- a/clang/test/CodeGen/ext-int-cc.c +++ b/clang/test/CodeGen/ext-int-cc.c @@ -49,8 +49,8 @@ void ParamPassing(_BitInt(128) b, _BitInt(64) c) {} // R600: define{{.*}} void @ParamPassing(ptr addrspace(5) byval(i128) align 8 %{{.+}}, i64 %{{.+}}) // ARC: define{{.*}} void @ParamPassing(ptr byval(i128) align 4 %{{.+}}, i64 inreg %{{.+}}) // XCORE: define{{.*}} void @ParamPassing(ptr byval(i128) align 4 %{{.+}}, i64 %{{.+}}) -// RISCV64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) -// RISCV32: define{{.*}} void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) +// RISCV64: define{{.*}} void @ParamPassing(i128 signext %{{.+}}, i64 signext %{{.+}}) +// RISCV32: define{{.*}} void @ParamPassing(ptr dead_on_return %{{.+}}, i64 signext %{{.+}}) // WASM: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // SYSTEMZ: define{{.*}} void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) // PPC64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) @@ -79,8 +79,8 @@ void ParamPassing2(_BitInt(127) b, _BitInt(63) c) {} // R600: define{{.*}} void @ParamPassing2(ptr addrspace(5) byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // ARC: define{{.*}} void @ParamPassing2(ptr byval(i128) align 4 %{{.+}}, i63 inreg %{{.+}}) // XCORE: define{{.*}} void @ParamPassing2(ptr byval(i128) align 4 %{{.+}}, i63 %{{.+}}) -// RISCV64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) -// RISCV32: define{{.*}} void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 %{{.+}}) +// RISCV64: define{{.*}} void @ParamPassing2(i127 signext %{{.+}}, i63 signext %{{.+}}) +// RISCV32: define{{.*}} void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 signext %{{.+}}) // WASM: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) // SYSTEMZ: define{{.*}} void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 signext %{{.+}}) // PPC64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) diff --git a/clang/test/CodeGen/inline-asm-systemz-flag-output.c b/clang/test/CodeGen/inline-asm-systemz-flag-output.c new file mode 100644 
index 0000000000000..041797b8b0639 --- /dev/null +++ b/clang/test/CodeGen/inline-asm-systemz-flag-output.c @@ -0,0 +1,57 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -O2 -triple s390x-linux -emit-llvm -o - %s | FileCheck %s + +// CHECK-LABEL: define dso_local signext range(i32 0, 4) i32 @test( +// CHECK-SAME: i32 noundef signext [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { i32, i32 } asm "ahi $0,42\0A", "=d,={@cc},0"(i32 [[X]]) #[[ATTR2:[0-9]+]], !srcloc [[META2:![0-9]+]] +// CHECK-NEXT: [[ASMRESULT1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[ASMRESULT1]], 4 +// CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP1]]) +// CHECK-NEXT: ret i32 [[ASMRESULT1]] +// +int test(int x) { + int cc; + asm ("ahi %[x],42\n" : [x] "+d"(x), "=@cc" (cc)); + return cc; +} + +// CHECK-LABEL: define dso_local signext range(i32 0, 2) i32 @test_low_high_transformation( +// CHECK-SAME: i32 noundef signext [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { i32, i32 } asm "ahi $0,42\0A", "=d,={@cc},0"(i32 [[X]]) #[[ATTR2]], !srcloc [[META3:![0-9]+]] +// CHECK-NEXT: [[ASMRESULT1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[ASMRESULT1]], 4 +// CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 [[ASMRESULT1]], -1 +// CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 2 +// CHECK-NEXT: [[LOR_EXT:%.*]] = zext i1 [[TMP3]] to i32 +// CHECK-NEXT: ret i32 [[LOR_EXT]] +// +int test_low_high_transformation(int x) { + int cc; + asm ("ahi %[x],42\n" : [x] "+d"(x), "=@cc" (cc)); + return cc == 1 || cc == 2; +} + +// CHECK-LABEL: define dso_local signext range(i32 0, 2) i32 @test_equal_high_transformation( +// CHECK-SAME: i32 noundef signext [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { i32, i32 } asm "ahi $0,42\0A", "=d,={@cc},0"(i32 [[X]]) #[[ATTR2]], !srcloc [[META4:![0-9]+]] +// CHECK-NEXT: [[ASMRESULT1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[ASMRESULT1]], 4 +// CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[ASMRESULT1]], 1 +// CHECK-NEXT: [[LOR_EXT:%.*]] = xor i32 [[TMP2]], 1 +// CHECK-NEXT: ret i32 [[LOR_EXT]] +// +int test_equal_high_transformation(int x) { + int cc; + asm ("ahi %[x],42\n" : [x] "+d"(x), "=@cc" (cc)); + return cc == 0 || cc == 2; +} +//. +// CHECK: [[META2]] = !{i64 788} +// CHECK: [[META3]] = !{i64 1670} +// CHECK: [[META4]] = !{i64 2505} +//. 
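The SystemZ flag-output test above pins down the contract that makes the `=@cc` constraint useful: the condition code is exposed as an ordinary integer in [0, 3] (hence the range(i32 0, 4) attribute and llvm.assume in the checks), so later comparisons against it can be folded into bit tests. A hedged usage sketch, SystemZ-only and not part of the patch (the condition-code meanings noted are those of the ahi add instruction):

// Hypothetical inline-asm helper built on the same "=@cc" output constraint.
static inline int add42_get_cc(int x) {
  int cc;
  asm("ahi %[x],42\n" : [x] "+d"(x), "=@cc"(cc));
  // For ahi: cc==0 result zero, cc==1 negative, cc==2 positive, cc==3 overflow.
  return cc;
}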
diff --git a/clang/test/CodeGen/isfpclass.c b/clang/test/CodeGen/isfpclass.c index 8a631c471c329..07e760e60b57b 100644 --- a/clang/test/CodeGen/isfpclass.c +++ b/clang/test/CodeGen/isfpclass.c @@ -162,17 +162,17 @@ int4 check_isfpclass_nan_strict_v4f32(float4 x) { // CHECK-LABEL: define dso_local void @check_isfpclass_nan_v4f64( // CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 16 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[X:%.*]] = load <4 x double>, ptr [[TMP0]], align 16, !tbaa [[CHAR_TBAA2:![0-9]+]] +// CHECK-NEXT: [[X:%.*]] = load <4 x double>, ptr [[TMP0]], align 16, !tbaa [[CHAR_TBAA6:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = fcmp uno <4 x double> [[X]], zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i64> -// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 16, !tbaa [[CHAR_TBAA2]] +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 16, !tbaa [[CHAR_TBAA6]] // CHECK-NEXT: ret void // long4 check_isfpclass_nan_v4f64(double4 x) { return __builtin_isfpclass(x, 3 /*NaN*/); } //. -// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0} //. diff --git a/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c b/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c index 20a31003fe915..e5ab6e00c4e54 100644 --- a/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c +++ b/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c @@ -54,13 +54,13 @@ long double powl(long double a, long double b); // // CHECK-MINGW32-LABEL: define dso_local void @test_powl( // CHECK-MINGW32-SAME: ptr dead_on_unwind noalias writable writeonly sret(x86_fp80) align 16 captures(none) initializes((0, 10)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[LONG_DOUBLE_TBAA6:![0-9]+]] -// CHECK-MINGW32: [[B:%.*]] = load x86_fp80, ptr [[TMP1]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] -// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] -// CHECK-MINGW32: store x86_fp80 [[B]], ptr [[BYVAL_TEMP1:%.*]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] +// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[LONG_DOUBLE_TBAA10:![0-9]+]] +// CHECK-MINGW32: [[B:%.*]] = load x86_fp80, ptr [[TMP1]], align 16, !tbaa [[LONG_DOUBLE_TBAA10]] +// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[LONG_DOUBLE_TBAA10]] +// CHECK-MINGW32: store x86_fp80 [[B]], ptr [[BYVAL_TEMP1:%.*]], align 16, !tbaa [[LONG_DOUBLE_TBAA10]] // CHECK-MINGW32: call void @powl(ptr dead_on_unwind nonnull writable sret(x86_fp80) align 16 [[TMP:%.*]], ptr dead_on_return noundef nonnull [[BYVAL_TEMP]], ptr dead_on_return noundef nonnull [[BYVAL_TEMP1]]) #[[ATTR3:[0-9]+]] -// CHECK-MINGW32: [[TMP2:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] -// CHECK-MINGW32: store x86_fp80 
[[TMP2]], ptr [[AGG_RESULT]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] +// CHECK-MINGW32: [[TMP2:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[LONG_DOUBLE_TBAA10]] +// CHECK-MINGW32: store x86_fp80 [[TMP2]], ptr [[AGG_RESULT]], align 16, !tbaa [[LONG_DOUBLE_TBAA10]] // long double test_powl(long double a, long double b) { return powl(a, b); @@ -137,7 +137,7 @@ long double test_powl(long double a, long double b) { // CHECK-MINGW32: store x86_fp80 [[CLD_REAL]], ptr [[BYVAL_TEMP:%.*]], align 16 // CHECK-MINGW32: store x86_fp80 [[CLD_IMAG]], ptr [[BYVAL_TEMP_IMAGP:%.*]], align 16 // CHECK-MINGW32: call void @cargl(ptr dead_on_unwind nonnull writable sret(x86_fp80) align 16 [[TMP:%.*]], ptr dead_on_return noundef nonnull [[BYVAL_TEMP]]) #[[ATTR3]] -// CHECK-MINGW32: [[TMP0:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] +// CHECK-MINGW32: [[TMP0:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[LONG_DOUBLE_TBAA10]] // CHECK-MINGW32: [[CLD_REAL3:%.*]] = load x86_fp80, ptr [[CLD]], align 16 // CHECK-MINGW32: [[CLD_IMAG5:%.*]] = load x86_fp80, ptr [[CLD_IMAGP]], align 16 // CHECK-MINGW32: store x86_fp80 [[MUL_RL:%.*]], ptr [[AGG_RESULT]], align 16 @@ -190,8 +190,8 @@ int ilogbl(long double a); // // CHECK-MINGW32-LABEL: define dso_local i32 @test_ilogb( // CHECK-MINGW32-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] -// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[LONG_DOUBLE_TBAA6]] +// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[LONG_DOUBLE_TBAA10]] +// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[LONG_DOUBLE_TBAA10]] // CHECK-MINGW32: [[CALL:%.*]] = call i32 @ilogbl(ptr dead_on_return noundef nonnull [[BYVAL_TEMP]]) #[[ATTR3]] // int test_ilogb(long double a) { @@ -243,8 +243,8 @@ int test_ilogb(long double a) { // CHECK-SPIR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK-SPIR: [[META5]] = !{!"Simple C/C++ TBAA"} //. -// CHECK-MINGW32: [[LONG_DOUBLE_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} -// CHECK-MINGW32: [[META7]] = !{!"long double", [[META8:![0-9]+]], i64 0} -// CHECK-MINGW32: [[META8]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} +// CHECK-MINGW32: [[META8:![0-9]+]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} // CHECK-MINGW32: [[META9]] = !{!"Simple C/C++ TBAA"} +// CHECK-MINGW32: [[LONG_DOUBLE_TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0} +// CHECK-MINGW32: [[META11]] = !{!"long double", [[META8]], i64 0} //. 
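The MinGW renumbering above (`LONG_DOUBLE_TBAA6` to `LONG_DOUBLE_TBAA10`) again changes only node order: the generic char node is now emitted before the `long double` access tag that inherits from it. The hierarchy the checks encode is unchanged; a hypothetical reduced test showing just that shape (not part of the patch):

// Hypothetical reduction of the TBAA hierarchy the checks encode: the
// "long double" type node descends from "omnipotent char", which descends
// from the TBAA root, and the access tag pairs the type node with itself
// at offset 0.
//
// CHECK: load x86_fp80, ptr %{{.*}}, !tbaa [[LD_TBAA:![0-9]+]]
// CHECK: [[LD_TBAA]] = !{[[LD:![0-9]+]], [[LD]], i64 0}
// CHECK: [[LD]] = !{!"long double", [[CHAR:![0-9]+]], i64 0}
// CHECK: [[CHAR]] = !{!"omnipotent char", [[ROOT:![0-9]+]], i64 0}
long double ld_load(long double *p) { return *p; }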
diff --git a/clang/test/CodeGen/math-libcalls-tbaa.c b/clang/test/CodeGen/math-libcalls-tbaa.c
index 53ca7963b27c1..ec234bca66371 100644
--- a/clang/test/CodeGen/math-libcalls-tbaa.c
+++ b/clang/test/CodeGen/math-libcalls-tbaa.c
@@ -17,8 +17,8 @@ float crealf(float _Complex);
// NONEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// NONEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]]
// NONEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 40
-// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA2:![0-9]+]]
-// NONEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9:[0-9]+]], !tbaa [[INT_TBAA6:![0-9]+]]
+// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA6:![0-9]+]]
+// NONEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9:[0-9]+]], !tbaa [[INT_TBAA2:![0-9]+]]
// NONEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]]
// NONEWSTRUCTPATHTBAA-NEXT: ret float [[MUL]]
//
@@ -26,8 +26,8 @@ float crealf(float _Complex);
// NEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// NEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]]
// NEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 40
-// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]]
-// NEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9:[0-9]+]], !tbaa [[TBAA6:![0-9]+]]
+// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA6:![0-9]+]]
+// NEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9:[0-9]+]], !tbaa [[TBAA2:![0-9]+]]
// NEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]]
// NEWSTRUCTPATHTBAA-NEXT: ret float [[MUL]]
//
@@ -41,8 +41,8 @@ float test_expf (float num[]) {
// NONEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// NONEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]]
// NONEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 40
-// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA2]]
-// NONEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9]], !tbaa [[INT_TBAA6]]
+// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA6]]
+// NONEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9]], !tbaa [[INT_TBAA2]]
// NONEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]]
// NONEWSTRUCTPATHTBAA-NEXT: ret float [[MUL]]
//
@@ -50,8 +50,8 @@ float test_expf (float num[]) {
// NEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// NEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]]
// NEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 40
-// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
-// NEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9]], !tbaa [[TBAA6]]
+// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA6]]
+// NEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call float @expf(float noundef [[TMP0]]) #[[ATTR9]], !tbaa [[TBAA2]]
// NEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul float [[TMP0]], [[CALL]]
// NEWSTRUCTPATHTBAA-NEXT: ret float [[MUL]]
//
@@ -92,7 +92,7 @@ double test_fabs (double num[]) {
// NONEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]]
// NONEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 80
// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[DOUBLE_TBAA8]]
-// NONEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call double @remainder(double noundef [[TMP0]], double noundef [[A]]) #[[ATTR9]], !tbaa [[INT_TBAA6]]
+// NONEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call double @remainder(double noundef [[TMP0]], double noundef [[A]]) #[[ATTR9]], !tbaa [[INT_TBAA2]]
// NONEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[CALL]]
// NONEWSTRUCTPATHTBAA-NEXT: ret double [[MUL]]
//
@@ -101,7 +101,7 @@ double test_fabs (double num[]) {
// NEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]]
// NEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 80
// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA8]]
-// NEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call double @remainder(double noundef [[TMP0]], double noundef [[A]]) #[[ATTR9]], !tbaa [[TBAA6]]
+// NEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call double @remainder(double noundef [[TMP0]], double noundef [[A]]) #[[ATTR9]], !tbaa [[TBAA2]]
// NEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], [[CALL]]
// NEWSTRUCTPATHTBAA-NEXT: ret double [[MUL]]
//
@@ -156,12 +156,12 @@ double test_frexp (double num[]) {
// NONEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[SIN]]) #[[ATTR9]]
// NONEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[COS]]) #[[ATTR9]]
// NONEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 8
-// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA2]]
+// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA6]]
// NONEWSTRUCTPATHTBAA-NEXT: call void @sincos(float noundef [[TMP0]], ptr noundef nonnull [[SIN]], ptr noundef nonnull [[COS]]) #[[ATTR9]]
-// NONEWSTRUCTPATHTBAA-NEXT: [[TMP1:%.*]] = load float, ptr [[SIN]], align 4, !tbaa [[FLOAT_TBAA2]]
-// NONEWSTRUCTPATHTBAA-NEXT: [[TMP2:%.*]] = load float, ptr [[COS]], align 4, !tbaa [[FLOAT_TBAA2]]
+// NONEWSTRUCTPATHTBAA-NEXT: [[TMP1:%.*]] = load float, ptr [[SIN]], align 4, !tbaa [[FLOAT_TBAA6]]
+// NONEWSTRUCTPATHTBAA-NEXT: [[TMP2:%.*]] = load float, ptr [[COS]], align 4, !tbaa [[FLOAT_TBAA6]]
// NONEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul float [[TMP1]], [[TMP2]]
-// NONEWSTRUCTPATHTBAA-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA2]]
+// NONEWSTRUCTPATHTBAA-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA6]]
// NONEWSTRUCTPATHTBAA-NEXT: [[ADD:%.*]] = fadd float [[MUL]], [[TMP3]]
// NONEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[COS]]) #[[ATTR9]]
// NONEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[SIN]]) #[[ATTR9]]
@@ -175,12 +175,12 @@ double test_frexp (double num[]) {
// NEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[SIN]]) #[[ATTR9]]
// NEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[COS]]) #[[ATTR9]]
// NEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 8
-// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA6]]
// NEWSTRUCTPATHTBAA-NEXT: call void @sincos(float noundef [[TMP0]], ptr noundef nonnull [[SIN]], ptr noundef nonnull [[COS]]) #[[ATTR9]]
-// NEWSTRUCTPATHTBAA-NEXT: [[TMP1:%.*]] = load float, ptr [[SIN]], align 4, !tbaa [[TBAA2]]
-// NEWSTRUCTPATHTBAA-NEXT: [[TMP2:%.*]] = load float, ptr [[COS]], align 4, !tbaa [[TBAA2]]
+// NEWSTRUCTPATHTBAA-NEXT: [[TMP1:%.*]] = load float, ptr [[SIN]], align 4, !tbaa [[TBAA6]]
+// NEWSTRUCTPATHTBAA-NEXT: [[TMP2:%.*]] = load float, ptr [[COS]], align 4, !tbaa [[TBAA6]]
// NEWSTRUCTPATHTBAA-NEXT: [[MUL:%.*]] = fmul float [[TMP1]], [[TMP2]]
-// NEWSTRUCTPATHTBAA-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NEWSTRUCTPATHTBAA-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA6]]
// NEWSTRUCTPATHTBAA-NEXT: [[ADD:%.*]] = fadd float [[MUL]], [[TMP3]]
// NEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[COS]]) #[[ATTR9]]
// NEWSTRUCTPATHTBAA-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[SIN]]) #[[ATTR9]]
@@ -198,12 +198,12 @@ float test_sincos (float num[]) {
// NONEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR7]] {
// NONEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]]
// NONEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 8
-// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA2]]
+// NONEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA6]]
// NONEWSTRUCTPATHTBAA-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[TMP0]], 0
// NONEWSTRUCTPATHTBAA-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x float] [[DOTFCA_0_INSERT]], float 0.000000e+00, 1
// NONEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call { float, float } @cacoshf([2 x float] noundef alignstack(8) [[DOTFCA_1_INSERT]]) #[[ATTR9]]
// NONEWSTRUCTPATHTBAA-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[CALL]], 0
-// NONEWSTRUCTPATHTBAA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA2]]
+// NONEWSTRUCTPATHTBAA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[FLOAT_TBAA6]]
// NONEWSTRUCTPATHTBAA-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]]
// NONEWSTRUCTPATHTBAA-NEXT: ret float [[ADD]]
//
@@ -211,12 +211,12 @@ float test_sincos (float num[]) {
// NEWSTRUCTPATHTBAA-SAME: ptr noundef readonly captures(none) [[NUM:%.*]]) local_unnamed_addr #[[ATTR7]] {
// NEWSTRUCTPATHTBAA-NEXT: [[ENTRY:.*:]]
// NEWSTRUCTPATHTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[NUM]], i64 8
-// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NEWSTRUCTPATHTBAA-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA6]]
// NEWSTRUCTPATHTBAA-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[TMP0]], 0
// NEWSTRUCTPATHTBAA-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x float] [[DOTFCA_0_INSERT]], float 0.000000e+00, 1
// NEWSTRUCTPATHTBAA-NEXT: [[CALL:%.*]] = tail call { float, float } @cacoshf([2 x float] noundef alignstack(8) [[DOTFCA_1_INSERT]]) #[[ATTR9]]
// NEWSTRUCTPATHTBAA-NEXT: [[TMP1:%.*]] = extractvalue { float, float } [[CALL]], 0
-// NEWSTRUCTPATHTBAA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NEWSTRUCTPATHTBAA-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA6]]
// NEWSTRUCTPATHTBAA-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]]
// NEWSTRUCTPATHTBAA-NEXT: ret float [[ADD]]
//
@@ -227,21 +227,21 @@ float test_cacoshf (float num[]) {
}
//.
-// NONEWSTRUCTPATHTBAA: [[FLOAT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// NONEWSTRUCTPATHTBAA: [[META3]] = !{!"float", [[META4:![0-9]+]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
// NONEWSTRUCTPATHTBAA: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
// NONEWSTRUCTPATHTBAA: [[META5]] = !{!"Simple C/C++ TBAA"}
-// NONEWSTRUCTPATHTBAA: [[INT_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
-// NONEWSTRUCTPATHTBAA: [[META7]] = !{!"int", [[META4]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[FLOAT_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
+// NONEWSTRUCTPATHTBAA: [[META7]] = !{!"float", [[META4]], i64 0}
// NONEWSTRUCTPATHTBAA: [[DOUBLE_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0}
// NONEWSTRUCTPATHTBAA: [[META9]] = !{!"double", [[META4]], i64 0}
//.
// NEWSTRUCTPATHTBAA: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0, i64 4}
-// NEWSTRUCTPATHTBAA: [[META3]] = !{[[META4:![0-9]+]], i64 4, !"float"}
+// NEWSTRUCTPATHTBAA: [[META3]] = !{[[META4:![0-9]+]], i64 4, !"int"}
// NEWSTRUCTPATHTBAA: [[META4]] = !{[[META5:![0-9]+]], i64 1, !"omnipotent char"}
// NEWSTRUCTPATHTBAA: [[META5]] = !{!"Simple C/C++ TBAA"}
// NEWSTRUCTPATHTBAA: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0, i64 4}
-// NEWSTRUCTPATHTBAA: [[META7]] = !{[[META4]], i64 4, !"int"}
+// NEWSTRUCTPATHTBAA: [[META7]] = !{[[META4]], i64 4, !"float"}
// NEWSTRUCTPATHTBAA: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0, i64 8}
// NEWSTRUCTPATHTBAA: [[META9]] = !{[[META4]], i64 8, !"double"}
//.
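The churn in math-libcalls-tbaa.c is purely a renumbering of the scalar TBAA nodes: the !"int" descriptor is now emitted ahead of !"float", so the autogenerated FLOAT_TBAA/INT_TBAA bindings swap numeric suffixes while the access tags themselves are unchanged. As a reference point, here is a minimal sketch (illustrative only, not part of the patch) of the aliasing decision these scalar tags feed:

/* Illustrative sketch, not part of the patch: with optimizations and strict
 * aliasing enabled, the distinct "float" and "int" TBAA tags let LLVM assume
 * *f and *n do not alias, so the final load of *f can be folded to 1.0f. */
float tbaa_demo(float *f, int *n) {
  *f = 1.0f; /* access tagged with the "float" TBAA node */
  *n = 2;    /* access tagged with the "int" TBAA node */
  return *f; /* the int store cannot alias this load under these tags */
}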
diff --git a/clang/test/CodeGen/pointer-arithmetic-align.c b/clang/test/CodeGen/pointer-arithmetic-align.c
index 745ab84635c1b..73b1c1f24bfc9 100644
--- a/clang/test/CodeGen/pointer-arithmetic-align.c
+++ b/clang/test/CodeGen/pointer-arithmetic-align.c
@@ -13,7 +13,7 @@ struct a {
// CHECK-SAME: ptr noundef writeonly captures(none) initializes((8, 9)) [[CTX:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[BLOCK:%.*]] = getelementptr inbounds nuw i8, ptr [[CTX]], i64 8
-// CHECK-NEXT: store i8 0, ptr [[BLOCK]], align 8, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: store i8 0, ptr [[BLOCK]], align 8, !tbaa [[TBAA6:![0-9]+]]
// CHECK-NEXT: ret void
//
void ptradd_0(struct a *ctx) {
@@ -24,7 +24,7 @@ void ptradd_0(struct a *ctx) {
// CHECK-SAME: ptr noundef writeonly captures(none) initializes((12, 13)) [[CTX:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[CTX]], i64 12
-// CHECK-NEXT: store i8 0, ptr [[ADD_PTR]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT: store i8 0, ptr [[ADD_PTR]], align 4, !tbaa [[TBAA6]]
// CHECK-NEXT: ret void
//
void ptradd_4(struct a *ctx) {
@@ -35,7 +35,7 @@ void ptradd_4(struct a *ctx) {
// CHECK-SAME: ptr noundef writeonly captures(none) initializes((16, 17)) [[CTX:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[CTX]], i64 16
-// CHECK-NEXT: store i8 0, ptr [[ADD_PTR]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT: store i8 0, ptr [[ADD_PTR]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: ret void
//
void ptradd_8(struct a *ctx) {
@@ -46,7 +46,7 @@ void ptradd_8(struct a *ctx) {
// CHECK-SAME: ptr noundef writeonly captures(none) initializes((16, 17)) [[CTX:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[CTX]], i64 16
-// CHECK-NEXT: store i8 0, ptr [[ADD_PTR]], align 8, !tbaa [[TBAA2]]
+// CHECK-NEXT: store i8 0, ptr [[ADD_PTR]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: ret void
//
void ptradd_8_commuted(struct a *ctx) {
@@ -57,7 +57,7 @@ void ptradd_8_commuted(struct a *ctx) {
// CHECK-SAME: ptr noundef writeonly captures(none) initializes((8, 9)) [[CTX:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[CTX]], i64 8
-// CHECK-NEXT: store i8 0, ptr [[ADD_PTR]], align 4, !tbaa [[TBAA2]]
+// CHECK-NEXT: store i8 0, ptr [[ADD_PTR]], align 4, !tbaa [[TBAA6]]
// CHECK-NEXT: ret void
//
void ptrsub_4(struct a *ctx) {
@@ -70,14 +70,14 @@ void ptrsub_4(struct a *ctx) {
// CHECK-NEXT: [[BLOCK:%.*]] = getelementptr inbounds nuw i8, ptr [[CTX]], i64 8
// CHECK-NEXT: [[IDX_EXT:%.*]] = zext i8 [[IDX]] to i64
// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[BLOCK]], i64 [[IDX_EXT]]
-// CHECK-NEXT: store i8 0, ptr [[ADD_PTR]], align 1, !tbaa [[TBAA2]]
+// CHECK-NEXT: store i8 0, ptr [[ADD_PTR]], align 1, !tbaa [[TBAA6]]
// CHECK-NEXT: ret void
//
void neg_ptradd_var_index(struct a *ctx, uint8_t idx) {
*(ctx->block + idx) = 0;
}
//.
-// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
-// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[TBAA6]] = !{[[META4]], [[META4]], i64 0}
//.
diff --git a/clang/test/CodeGen/sanitize-metadata-ignorelist.c b/clang/test/CodeGen/sanitize-metadata-ignorelist.c
index 4dc8c0c35fefe..dac69e97d5a1a 100644
--- a/clang/test/CodeGen/sanitize-metadata-ignorelist.c
+++ b/clang/test/CodeGen/sanitize-metadata-ignorelist.c
@@ -7,9 +7,9 @@ int y;
// ALLOW-LABEL: define {{[^@]+}}@foo
-// ALLOW-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] !pcsections !2 {
+// ALLOW-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] !pcsections !6 {
// ALLOW-NEXT: entry:
-// ALLOW-NEXT: [[TMP0:%.*]] = atomicrmw add ptr @y, i32 1 monotonic, align 4, !pcsections !4
+// ALLOW-NEXT: [[TMP0:%.*]] = atomicrmw add ptr @y, i32 1 monotonic, align 4, !pcsections !8
// ALLOW-NEXT: ret void
//
// FUN-LABEL: define {{[^@]+}}@foo
@@ -29,15 +29,15 @@ void foo() {
}
// ALLOW-LABEL: define {{[^@]+}}@bar
-// ALLOW-SAME: () local_unnamed_addr #[[ATTR0]] !pcsections !2 {
+// ALLOW-SAME: () local_unnamed_addr #[[ATTR0]] !pcsections !6 {
// ALLOW-NEXT: entry:
-// ALLOW-NEXT: [[TMP0:%.*]] = atomicrmw add ptr @y, i32 2 monotonic, align 4, !pcsections !4
+// ALLOW-NEXT: [[TMP0:%.*]] = atomicrmw add ptr @y, i32 2 monotonic, align 4, !pcsections !8
// ALLOW-NEXT: ret void
//
// FUN-LABEL: define {{[^@]+}}@bar
-// FUN-SAME: () local_unnamed_addr #[[ATTR0]] !pcsections !2 {
+// FUN-SAME: () local_unnamed_addr #[[ATTR0]] !pcsections !6 {
// FUN-NEXT: entry:
-// FUN-NEXT: [[TMP0:%.*]] = atomicrmw add ptr @y, i32 2 monotonic, align 4, !pcsections !4
+// FUN-NEXT: [[TMP0:%.*]] = atomicrmw add ptr @y, i32 2 monotonic, align 4, !pcsections !8
// FUN-NEXT: ret void
//
// SRC-LABEL: define {{[^@]+}}@bar
diff --git a/clang/test/CodeGen/sanitize-metadata-nosanitize.c b/clang/test/CodeGen/sanitize-metadata-nosanitize.c
index 22ed25bd3b670..f2672d7f89157 100644
--- a/clang/test/CodeGen/sanitize-metadata-nosanitize.c
+++ b/clang/test/CodeGen/sanitize-metadata-nosanitize.c
@@ -12,7 +12,7 @@
//.
// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
// CHECK-LABEL: define dso_local void @escape(
-// CHECK-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !pcsections [[META2:![0-9]+]] {
+// CHECK-SAME: ptr noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !pcsections [[META6:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: ret void
//
@@ -23,13 +23,13 @@ __attribute__((noinline, not_tail_called)) void escape(const volatile void *p) {
// CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(write, argmem: readwrite, inaccessiblemem: none)
// CHECK-LABEL: define dso_local i32 @normal_function(
-// CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !pcsections [[META4:![0-9]+]] {
+// CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !pcsections [[META8:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[INTPTR_TBAA6:![0-9]+]]
-// CHECK-NEXT: store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections [[META11:![0-9]+]]
+// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[INTPTR_TBAA10:![0-9]+]]
+// CHECK-NEXT: store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections [[META13:![0-9]+]]
// CHECK-NEXT: notail call void @escape(ptr noundef nonnull [[X_ADDR]])
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA12:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA2:![0-9]+]]
// CHECK-NEXT: ret i32 [[TMP0]]
//
int normal_function(int *x, int *y) {
@@ -43,10 +43,10 @@ int normal_function(int *x, int *y) {
// CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[INTPTR_TBAA6]]
+// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[INTPTR_TBAA10]]
// CHECK-NEXT: store atomic i32 1, ptr [[X]] monotonic, align 4
// CHECK-NEXT: notail call void @escape(ptr noundef nonnull [[X_ADDR]])
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA12]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA2]]
// CHECK-NEXT: ret i32 [[TMP0]]
//
__attribute__((disable_sanitizer_instrumentation)) int test_disable_sanitize_instrumentation(int *x, int *y) {
@@ -60,10 +60,10 @@ __attribute__((disable_sanitizer_instrumentation)) int test_disable_sanitize_ins
// CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] !pcsections [[META14:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[INTPTR_TBAA6]]
-// CHECK-NEXT: store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections [[META11]]
+// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[INTPTR_TBAA10]]
+// CHECK-NEXT: store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections [[META13]]
// CHECK-NEXT: notail call void @escape(ptr noundef nonnull [[X_ADDR]])
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA12]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA2]]
// CHECK-NEXT: ret i32 [[TMP0]]
//
__attribute__((no_sanitize("thread"))) int test_no_sanitize_thread(int *x, int *y) {
@@ -77,10 +77,10 @@ __attribute__((no_sanitize("thread"))) int test_no_sanitize_thread(int *x, int *
// CHECK-SAME: ptr noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] !pcsections [[META14]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8
-// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[INTPTR_TBAA6]]
-// CHECK-NEXT: store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections [[META11]]
+// CHECK-NEXT: store ptr [[X]], ptr [[X_ADDR]], align 8, !tbaa [[INTPTR_TBAA10]]
+// CHECK-NEXT: store atomic i32 1, ptr [[X]] monotonic, align 4, !pcsections [[META13]]
// CHECK-NEXT: notail call void @escape(ptr noundef nonnull [[X_ADDR]])
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA12]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Y]], align 4, !tbaa [[INT_TBAA2]]
// CHECK-NEXT: ret i32 [[TMP0]]
//
__attribute__((no_sanitize("all"))) int test_no_sanitize_all(int *x, int *y) {
@@ -97,18 +97,18 @@ __attribute__((no_sanitize("all"))) int test_no_sanitize_all(int *x, int *y) {
//.
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-// CHECK: [[META2]] = !{!"sanmd_covered2!C", [[META3:![0-9]+]]}
-// CHECK: [[META3]] = !{i64 0}
-// CHECK: [[META4]] = !{!"sanmd_covered2!C", [[META5:![0-9]+]]}
-// CHECK: [[META5]] = !{i64 3}
-// CHECK: [[INTPTR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
-// CHECK: [[META7]] = !{!"p1 int", [[META8:![0-9]+]], i64 0}
-// CHECK: [[META8]] = !{!"any pointer", [[META9:![0-9]+]], i64 0}
-// CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0}
-// CHECK: [[META10]] = !{!"Simple C/C++ TBAA"}
-// CHECK: [[META11]] = !{!"sanmd_atomics2!C"}
-// CHECK: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
-// CHECK: [[META13]] = !{!"int", [[META9]], i64 0}
+// CHECK: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[META6]] = !{!"sanmd_covered2!C", [[META7:![0-9]+]]}
+// CHECK: [[META7]] = !{i64 0}
+// CHECK: [[META8]] = !{!"sanmd_covered2!C", [[META9:![0-9]+]]}
+// CHECK: [[META9]] = !{i64 3}
+// CHECK: [[INTPTR_TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0}
+// CHECK: [[META11]] = !{!"p1 int", [[META12:![0-9]+]], i64 0}
+// CHECK: [[META12]] = !{!"any pointer", [[META4]], i64 0}
+// CHECK: [[META13]] = !{!"sanmd_atomics2!C"}
// CHECK: [[META14]] = !{!"sanmd_covered2!C", [[META15:![0-9]+]]}
// CHECK: [[META15]] = !{i64 2}
//.
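The sanitize-metadata tests above only renumber metadata nodes, but they exercise real Clang attributes; a minimal sketch (illustrative only, not part of the patch) of the behaviors the checks distinguish:

int y;

/* Instrumented: the function gets "sanmd_covered" !pcsections metadata and
 * its atomic operation gets a "sanmd_atomics" entry. */
void plain(void) { __atomic_fetch_add(&y, 1, __ATOMIC_RELAXED); }

/* No sanitizer instrumentation and no sanitizer metadata at all. */
__attribute__((disable_sanitizer_instrumentation))
void bare(void) { __atomic_fetch_add(&y, 1, __ATOMIC_RELAXED); }

/* TSan instrumentation is skipped, but per the checks above the function
 * still carries coverage and atomics !pcsections metadata, just with a
 * different feature mask. */
__attribute__((no_sanitize("thread")))
void no_tsan(void) { __atomic_fetch_add(&y, 1, __ATOMIC_RELAXED); }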
diff --git a/clang/test/CodeGen/scoped-atomic-ops.c b/clang/test/CodeGen/scoped-atomic-ops.c
index 545a6c90892c2..c39048120a457 100644
--- a/clang/test/CodeGen/scoped-atomic-ops.c
+++ b/clang/test/CodeGen/scoped-atomic-ops.c
@@ -1,113 +1,772 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
-// RUN: -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN: -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_DEF %s
// RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \
-// RUN: -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s
+// RUN: -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_20 %s
// RUN: %clang_cc1 %s -emit-llvm -o - -triple=spirv64-unknown-unknown -ffreestanding \
// RUN: -fvisibility=hidden | FileCheck --check-prefix=SPIRV %s
-// AMDGCN-LABEL: define hidden i32 @fi1a(
-// AMDGCN: [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:.+]] monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: define hidden spir_func i32 @fi1a(
-// SPIRV: [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:.+]] monotonic, align 4
-// SPIRV: [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi1a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[V:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[V_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP1]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP5]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP9]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP11]], ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP12]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi1a(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[V:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP9]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_20-NEXT: store i32 [[TMP11]], ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP12]]
+//
+// SPIRV-LABEL: define hidden spir_func i32 @fi1a(
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[V:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP1]], ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP5]], ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP7]], ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP9]], ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP11]], ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: ret i32 [[TMP12]]
+//
int fi1a(int *i) {
int v;
__scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
__scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
__scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
+ __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
__scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
__scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
return v;
}
// AMDGCN-LABEL: define hidden i32 @fi1b(
-// AMDGCN: [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:%.+]] monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN-NEXT: [[ATOMIC_TEMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP1]] to ptr
+// AMDGCN-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN-NEXT: [[ATOMIC_TEMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP3]] to ptr
+// AMDGCN-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN-NEXT: [[ATOMIC_TEMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP5]] to ptr
+// AMDGCN-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// AMDGCN-NEXT: store i32 [[TMP1]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4
+// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("agent") monotonic, align 4
+// AMDGCN-NEXT: store i32 [[TMP5]], ptr [[ATOMIC_TEMP1_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 [[TMP6]], ptr [[TMP7]], align 4
+// AMDGCN-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4
+// AMDGCN-NEXT: store i32 [[TMP9]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP11:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 [[TMP10]], ptr [[TMP11]], align 4
+// AMDGCN-NEXT: [[TMP12:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("cluster") monotonic, align 4
+// AMDGCN-NEXT: store i32 [[TMP13]], ptr [[ATOMIC_TEMP3_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP15:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4
+// AMDGCN-NEXT: [[TMP16:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("wavefront") monotonic, align 4
+// AMDGCN-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP19:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN-NEXT: [[TMP20:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4
+// AMDGCN-NEXT: store i32 [[TMP21]], ptr [[ATOMIC_TEMP5_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP23:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 [[TMP22]], ptr [[TMP23]], align 4
+// AMDGCN-NEXT: [[TMP24:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// AMDGCN-NEXT: ret i32 [[TMP25]]
+//
// SPIRV-LABEL: define hidden spir_func i32 @fi1b(
-// SPIRV: [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:%.+]] monotonic, align 4
-// SPIRV: [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:%.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:%.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP1]], ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP5]], ptr [[ATOMIC_TEMP1]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP6]], ptr [[TMP7]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP9]], ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP10]], ptr [[TMP11]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP13]], ptr [[ATOMIC_TEMP3]], align 4
+// SPIRV-NEXT: [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3]], align 4
+// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4
+// SPIRV-NEXT: [[TMP16:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP21]], ptr [[ATOMIC_TEMP5]], align 4
+// SPIRV-NEXT: [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5]], align 4
+// SPIRV-NEXT: [[TMP23:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP22]], ptr [[TMP23]], align 4
+// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// SPIRV-NEXT: ret i32 [[TMP25]]
+//
int fi1b(int *i) {
*i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
*i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
*i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
+ *i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
*i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
*i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
return *i;
}
-// AMDGCN-LABEL: define hidden void @fi2a(
-// AMDGCN: store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi2a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[V:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[V_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[V_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-NEXT: ret void
+//
+// AMDGCN_CL_20-LABEL: define hidden void @fi2a(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[V:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[V]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP2]], ptr [[TMP0]] monotonic, align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP5]], ptr [[TMP3]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP8]], ptr [[TMP6]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP11]], ptr [[TMP9]] syncscope("cluster") monotonic, align 4
+// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
+// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP14]], ptr [[TMP12]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP17]], ptr [[TMP15]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_20-NEXT: ret void
+//
// SPIRV-LABEL: define hidden spir_func void @fi2a(
-// SPIRV: store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4
-// SPIRV: store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("device") monotonic, align 4
-// SPIRV: store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[V:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[V]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP5:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP7:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP9:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[V]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: ret void
+//
void fi2a(int *i) {
int v = 1;
__scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
__scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
__scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
+ __scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
__scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
__scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
}
// AMDGCN-LABEL: define hidden void @fi2b(
-// AMDGCN: store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[DOTATOMICTMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[DOTATOMICTMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN-NEXT: [[DOTATOMICTMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP2]] to ptr
+// AMDGCN-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN-NEXT: [[DOTATOMICTMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP4]] to ptr
+// AMDGCN-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
+// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4
+// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP2_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2_ASCAST]], align 4
+// AMDGCN-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// AMDGCN-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4
+// AMDGCN-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP4_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4_ASCAST]], align 4
+// AMDGCN-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4
+// AMDGCN-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// AMDGCN-NEXT: ret void
+//
// SPIRV-LABEL: define hidden spir_func void @fi2b(
-// SPIRV: store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4
-// SPIRV: store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("device") monotonic, align 4
-// SPIRV: store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4
-// SPIRV: store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("subgroup") monotonic, align 4
-// SPIRV: store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP2]], align 4
+// SPIRV-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP4]], align 4
+// SPIRV-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: ret void
+//
void fi2b(int *i) {
__scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
__scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
__scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
+ __scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
__scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
__scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
}
-// AMDGCN-LABEL: define hidden void @fi3a(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] monotonic, align 4
-// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] monotonic, align 4
-// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] monotonic, align 4
-// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi3a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load 
i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_DEF-NEXT: ret void +// +// AMDGCN_CL_20-LABEL: define hidden void @fi3a( +// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, 
+// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_20-NEXT: ret void
+//
 // SPIRV-LABEL: define hidden spir_func void @fi3a(
-// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4
-// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] monotonic, align 4
-// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] monotonic, align 4
-// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] monotonic, align 4
-// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] monotonic, align 4
-// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] monotonic, align 4
-// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] monotonic, align 4
-// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] monotonic, align 4
+// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// SPIRV-NEXT: ret void
+//
 void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
   *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
   *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
@@ -119,24 +778,357 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
   *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
 }
-// AMDGCN-LABEL: define hidden void @fi3b(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("agent") monotonic, align 4
-// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi3b(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_DEF-NEXT: ret void
+//
+// AMDGCN_CL_20-LABEL: define hidden void @fi3b(
+// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_20-NEXT: ret void
+//
 // SPIRV-LABEL: define hidden spir_func void @fi3b(
-// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("device") monotonic, align 4
-// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("device") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// SPIRV-NEXT: ret void
+//
 void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
   *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
   *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
@@ -148,24 +1140,357 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
   *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
 }
-// AMDGCN-LABEL: define hidden void @fi3c(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("workgroup") monotonic, align 4
-// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi3c(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] =
load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_DEF-NEXT: ret void +// +// AMDGCN_CL_20-LABEL: define hidden void @fi3c( +// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: 
[[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr 
[[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr 
[[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_20-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi3c( -// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr 
[[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("workgroup") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = 
load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// SPIRV-NEXT: ret void +// void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int 
*g, int *h) {
   *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
   *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
@@ -177,24 +1502,719 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
   *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
 }
-// AMDGCN-LABEL: define hidden void @fi3d(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("wavefront") monotonic, align 4
-// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi3_clustr(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, 
addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], 
!amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// 
AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_DEF-NEXT: ret void +// +// AMDGCN_CL_20-LABEL: define hidden void @fi3_clustr( +// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, 
addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, 
ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_20-NEXT: ret void +// +// SPIRV-LABEL: define hidden spir_func void @fi3_clustr( +// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: 
[[H_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// 
SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// SPIRV-NEXT: ret void
+//
+void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
+  *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+  *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+  *c = __scoped_atomic_fetch_and(c, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+  *d = __scoped_atomic_fetch_or(d, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+  *e = __scoped_atomic_fetch_xor(e, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+  *f = __scoped_atomic_fetch_nand(f, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+  *g = __scoped_atomic_fetch_min(g, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+  *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+}
+
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi3d(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 
4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// 
AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_DEF-NEXT: ret void +// +// AMDGCN_CL_20-LABEL: define hidden void @fi3d( +// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: 
[[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 
4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw 
min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_20-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi3d( -// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("subgroup") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, 
align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP24:%.*]] = load 
ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// SPIRV-NEXT: ret void
+//
 void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
   *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
   *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
@@ -206,24 +2226,357 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
   *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
 }
 
-// AMDGCN-LABEL: define hidden void @fi3e(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("singlethread") monotonic, align 4
-// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden void @fi3e(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]])
#[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr 
[[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, 
ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_DEF-NEXT: ret void +// +// AMDGCN_CL_20-LABEL: define hidden void @fi3e( +// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] 
to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], 
ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// 
AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// AMDGCN_CL_20-NEXT: ret void
+//
 // SPIRV-LABEL: define hidden spir_func void @fi3e(
-// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("singlethread") monotonic, align 4
-// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("singlethread") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4
+// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4
+// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4
+// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4
+// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4
+// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4
+// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4
+// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4
+// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4
+// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4
+// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4
+// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4
+// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4
+// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4
+// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4
+// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4
+// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4
+// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4
+// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4
+// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4
+// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8
+// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4
+// SPIRV-NEXT: ret void
+//
 void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
 *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
@@ -235,10 +2588,98 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) {
 *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
 }
-// AMDGCN-LABEL: define hidden zeroext i1 @fi4a(
-// AMDGCN-DAG: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4a(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4a(
-// SPIRV-DAG: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi4a(int *i) {
 int cmp = 0;
 int desired = 1;
 return __scoped_atomic_compare_exchange(i, &cmp, &desired, 0,
@@ -247,10 +2688,98 @@ _Bool fi4a(int *i) {
 __MEMORY_SCOPE_SYSTEM);
 }
-// AMDGCN-LABEL: define hidden zeroext i1 @fi4b(
-// AMDGCN-DAG: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("agent") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4b(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4b(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("agent") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4b(
-// SPIRV-DAG: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("device") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("device") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi4b(int *i) {
 int cmp = 0;
 int desired = 1;
 return __scoped_atomic_compare_exchange(i, &cmp, &desired, 0,
@@ -259,10 +2788,98 @@ _Bool fi4b(int *i) {
 __MEMORY_SCOPE_DEVICE);
 }
-// AMDGCN-LABEL: define hidden zeroext i1 @fi4c(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4c(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4c(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("workgroup") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4c(
-// SPIRV: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi4c(int *i) {
 int cmp = 0;
 int desired = 1;
 return __scoped_atomic_compare_exchange(i, &cmp, &desired, 0,
@@ -271,10 +2888,198 @@ _Bool fi4c(int *i) {
 __MEMORY_SCOPE_WRKGRP);
 }
-// AMDGCN-LABEL: define hidden zeroext i1 @fi4d(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4_clustr(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4_clustr(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("cluster") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
+// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4_clustr(
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
+_Bool fi4_clustr(int *i) {
+ int cmp = 0;
+ int desired = 1;
+ return __scoped_atomic_compare_exchange(i, &cmp, &desired, 0,
+ __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+ __MEMORY_SCOPE_CLUSTR);
+}
+
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4d(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4d(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4d(
-// SPIRV: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("subgroup") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("subgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi4d(int *i) {
 int cmp = 0;
 int desired = 1;
 return __scoped_atomic_compare_exchange(i, &cmp, &desired, 0,
@@ -283,10 +3088,98 @@ _Bool fi4d(int *i) {
 __MEMORY_SCOPE_WVFRNT);
 }
-// AMDGCN-LABEL: define hidden zeroext i1 @fi4e(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4e(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4e(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("singlethread") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4e(
-// SPIRV: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi4e(int *i) {
 int cmp = 0;
 int desired = 1;
 return __scoped_atomic_compare_exchange(i, &cmp, &desired, 0,
@@ -295,10 +3188,98 @@ _Bool fi4e(int *i) {
 __MEMORY_SCOPE_SINGLE);
 }
-// AMDGCN-LABEL: define hidden zeroext i1 @fi5a(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5a(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5a(
-// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi5a(int *i) {
 int cmp = 0;
 return __scoped_atomic_compare_exchange_n(i, &cmp, 1, 1, __ATOMIC_ACQUIRE,
@@ -306,10 +3287,98 @@ _Bool fi5a(int *i) {
 __MEMORY_SCOPE_SYSTEM);
 }
-// AMDGCN-LABEL: define hidden zeroext i1 @fi5b(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("agent") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5b(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5b(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5b(
-// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("device") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("device") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi5b(int *i) {
 int cmp = 0;
 return __scoped_atomic_compare_exchange_n(i, &cmp, 1, 1, __ATOMIC_ACQUIRE,
@@ -317,127 +3386,1161 @@ _Bool fi5b(int *i) {
 __MEMORY_SCOPE_DEVICE);
 }
-// AMDGCN-LABEL: define hidden zeroext i1 @fi5c(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5c(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5c(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5c(
-// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi5c(int *i) {
 int cmp = 0;
 return __scoped_atomic_compare_exchange_n(
 i, &cmp, 1, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_WRKGRP);
 }
-// AMDGCN-LABEL: define hidden zeroext i1 @fi5d(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5_clustr(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5_clustr(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// 
AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// +// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5_clustr( +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// SPIRV: [[CMPXCHG_STORE_EXPECTED]]: +// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]] +// SPIRV: [[CMPXCHG_CONTINUE]]: +// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// +_Bool fi5_clustr(int *i) { + int cmp = 0; + return __scoped_atomic_compare_exchange_n( + i, &cmp, 1, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_CLUSTR); +} + +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5d( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr 
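Every fi5* variant above lowers __scoped_atomic_compare_exchange_n to the same weak-cmpxchg / extractvalue / store-expected pattern; only the syncscope string tracks the scope argument. As a reading aid, a minimal usage sketch of the builtin these checks exercise follows. The helper name try_claim and the choice of device scope are illustrative only, not part of this patch:

// Sketch only: try_claim is a hypothetical helper, not in the test file.
_Bool try_claim(int *flag) {
  int expected = 0;
  // Weak CAS may fail spuriously; on failure the observed value is written
  // back into `expected`, matching the cmpxchg.store_expected block above.
  return __scoped_atomic_compare_exchange_n(flag, &expected, 1, /*weak=*/1,
                                            __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
                                            __MEMORY_SCOPE_DEVICE);
}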
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5d(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5d(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5d(
-// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("subgroup") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("subgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi5d(int *i) {
   int cmp = 0;
   return __scoped_atomic_compare_exchange_n(
       i, &cmp, 1, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_WVFRNT);
 }
 
-// AMDGCN-LABEL: define hidden zeroext i1 @fi5e(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5e(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5e(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5e(
-// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi5e(int *i) {
   int cmp = 0;
   return __scoped_atomic_compare_exchange_n(
       i, &cmp, 1, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SINGLE);
 }
 
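The fi6* group that follows switches to __scoped_atomic_exchange, the generic form that returns the old value through an out-pointer rather than by value. A minimal sketch, using the same relaxed ordering as the tests; the helper name swap_workgroup is illustrative, not part of this patch:

// Sketch only: swap_workgroup is a hypothetical helper, not in the test file.
int swap_workgroup(int *obj, int desired) {
  int old;
  // The old value lands in `old`; the checks below show this as an
  // atomicrmw xchg followed by a plain store into the result slot.
  __scoped_atomic_exchange(obj, &desired, &old, __ATOMIC_RELAXED,
                           __MEMORY_SCOPE_WRKGRP);
  return old;
}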
-// AMDGCN-LABEL: define hidden i32 @fi6a(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi6a(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]]
+//
 // SPIRV-LABEL: define hidden spir_func i32 @fi6a(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4
+// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4
+// SPIRV-NEXT: ret i32 [[TMP4]]
+//
 int fi6a(int *c, int *d) {
   int ret;
   __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
   return ret;
 }
 
-// AMDGCN-LABEL: define hidden i32 @fi6b(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("agent") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6b(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi6b(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]]
+//
 // SPIRV-LABEL: define hidden spir_func i32 @fi6b(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("device") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("device") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4
+// SPIRV-NEXT: ret i32 [[TMP4]]
+//
 int fi6b(int *c, int *d) {
   int ret;
   __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
   return ret;
 }
 
-// AMDGCN-LABEL: define hidden i32 @fi6c(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6c(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi6c(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]]
+//
 // SPIRV-LABEL: define hidden spir_func i32 @fi6c(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4
+// SPIRV-NEXT: ret i32 [[TMP4]]
+//
 int fi6c(int *c, int *d) {
   int ret;
   __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP);
   return ret;
 }
 
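The fi6_clustr hunk below pins down the new cluster scope: on AMDGCN, __MEMORY_SCOPE_CLUSTR lowers to syncscope("cluster"), while the SPIRV checks in this same hunk emit syncscope("workgroup") for it. A minimal sketch of the call shape; the helper name swap_cluster is illustrative, not part of this patch:

// Sketch only: swap_cluster is a hypothetical helper, not in the test file.
int swap_cluster(int *obj, int desired) {
  int old;
  // Per the checks below, this becomes syncscope("cluster") on AMDGCN;
  // the SPIRV lowering shown in this patch uses syncscope("workgroup").
  __scoped_atomic_exchange(obj, &desired, &old, __ATOMIC_RELAXED,
                           __MEMORY_SCOPE_CLUSTR);
  return old;
}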
-// AMDGCN-LABEL: define hidden i32 @fi6d(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("wavefront") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6_clustr(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi6_clustr(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]]
+//
+// SPIRV-LABEL: define hidden spir_func i32 @fi6_clustr(
+// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4
+// SPIRV-NEXT: ret i32 [[TMP4]]
+//
+int fi6_clustr(int *c, int *d) {
+  int ret;
+  __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR);
+  return ret;
+}
+
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6d(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi6d(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]]
+//
 // SPIRV-LABEL: define hidden spir_func i32 @fi6d(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("subgroup") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("subgroup") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4
+// SPIRV-NEXT: ret i32 [[TMP4]]
+//
 int fi6d(int *c, int *d) {
   int ret;
   __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT);
   return ret;
 }
 
-// AMDGCN-LABEL: define hidden i32 @fi6e(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6e(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]]
+//
+// AMDGCN_CL_20-LABEL: define hidden i32 @fi6e(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4
+// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]]
+//
 // SPIRV-LABEL: define hidden spir_func i32 @fi6e(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4
+// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("singlethread") monotonic, align 4
+// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4
+// SPIRV-NEXT: ret i32 [[TMP4]]
+//
 int fi6e(int *c, int *d) {
   int ret;
   __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE);
   return ret;
 }
 
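The fi7* group that follows covers the _n form on _Bool: the checks show the value widened to i8 for the atomicrmw xchg and truncated back to i1 on return. A minimal test-and-set sketch; the helper name test_and_set_device and the acquire ordering are the example's choices (the tests themselves use relaxed):

// Sketch only: test_and_set_device is a hypothetical helper.
_Bool test_and_set_device(_Bool *flag) {
  // Returns the previous value; lowers to an i8 atomicrmw xchg plus a trunc
  // to i1, as in the fi7a..fi7c checks below.
  return __scoped_atomic_exchange_n(flag, 1, __ATOMIC_ACQUIRE,
                                    __MEMORY_SCOPE_DEVICE);
}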
-// AMDGCN-LABEL: define hidden zeroext i1 @fi7a(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] monotonic, align 1
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7a(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7a(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7a(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] monotonic, align 1
+// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1
+// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi7a(_Bool *c) {
   return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED,
                                     __MEMORY_SCOPE_SYSTEM);
 }
 
-// AMDGCN-LABEL: define hidden zeroext i1 @fi7b(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("agent") monotonic, align 1
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7b(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7b(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7b(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("device") monotonic, align 1
+// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("device") monotonic, align 1
+// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi7b(_Bool *c) {
   return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED,
                                     __MEMORY_SCOPE_DEVICE);
 }
 
-// AMDGCN-LABEL: define hidden zeroext i1 @fi7c(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("workgroup") monotonic, align 1
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7c(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7c(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
 // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7c(
-// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("workgroup") monotonic, align 1
+// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1
+// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi7c(_Bool *c) {
   return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED,
                                     __MEMORY_SCOPE_WRKGRP);
 }
 
-// AMDGCN-LABEL: define hidden zeroext i1 @fi7d(
-// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("wavefront") monotonic, align 1
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7_clustr(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]]
+// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7_clustr(
+// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]]
+// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
+// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7_clustr(
+// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1
+// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1
+// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1
+// SPIRV-NEXT:
[[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// +_Bool fi7_clustr(_Bool *c) { + return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED, + __MEMORY_SCOPE_CLUSTR); +} + +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7d( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7d( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 
@fi7d( -// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("subgroup") monotonic, align 1 +// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("subgroup") monotonic, align 1 +// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi7d(_Bool *c) { return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi7e( -// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("singlethread") monotonic, align 1 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7e( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7e( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr 
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7e( -// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("singlethread") monotonic, align 1 +// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1 +// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi7e(_Bool *c) { return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE); } +//. +// AMDGCN_CL_DEF: [[META3]] = !{} +//. +// AMDGCN_CL_20: [[META4]] = !{} +//. 
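The fi7_clustr checks above pin down the lowering of the new cluster memory scope: AMDGCN emits atomicrmw xchg with syncscope("cluster"), while SPIR-V, which has no cluster scope, conservatively widens it to syncscope("workgroup"). For reference, a minimal source-level sketch of the builtin as exercised by the test; take_cluster_flag is a hypothetical name, everything else appears in the test itself:

    /* Atomically set a flag shared within a cluster; returns the previous
       value. __MEMORY_SCOPE_CLUSTR lowers to syncscope("cluster") on AMDGCN
       and to syncscope("workgroup") on SPIR-V, per the FileCheck lines above. */
    _Bool take_cluster_flag(_Bool *flag) {
      return __scoped_atomic_exchange_n(flag, 1, __ATOMIC_RELAXED,
                                        __MEMORY_SCOPE_CLUSTR);
    }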
diff --git a/clang/test/CodeGen/scoped-fence-ops.c b/clang/test/CodeGen/scoped-fence-ops.c index 1f8a9c5248c58..fb11f589c13dd 100644 --- a/clang/test/CodeGen/scoped-fence-ops.c +++ b/clang/test/CodeGen/scoped-fence-ops.c @@ -30,6 +30,35 @@ void fe1a() { __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_WRKGRP); } +// AMDGCN-LABEL: define hidden void @fe1b( +// AMDGCN-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[ORD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[ORD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ORD_ADDR]] to ptr +// AMDGCN-NEXT: store i32 [[ORD]], ptr [[ORD_ADDR_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP0:%.*]] = load i32, ptr [[ORD_ADDR_ASCAST]], align 4 +// AMDGCN-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ +// AMDGCN-NEXT: i32 1, label %[[ACQUIRE:.*]] +// AMDGCN-NEXT: i32 2, label %[[ACQUIRE]] +// AMDGCN-NEXT: i32 3, label %[[RELEASE:.*]] +// AMDGCN-NEXT: i32 4, label %[[ACQREL:.*]] +// AMDGCN-NEXT: i32 5, label %[[SEQCST:.*]] +// AMDGCN-NEXT: ] +// AMDGCN: [[ATOMIC_SCOPE_CONTINUE]]: +// AMDGCN-NEXT: ret void +// AMDGCN: [[ACQUIRE]]: +// AMDGCN-NEXT: fence syncscope("workgroup") acquire +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[RELEASE]]: +// AMDGCN-NEXT: fence syncscope("workgroup") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[ACQREL]]: +// AMDGCN-NEXT: fence syncscope("workgroup") acq_rel +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[SEQCST]]: +// AMDGCN-NEXT: fence syncscope("workgroup") seq_cst +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// // SPIRV-LABEL: define hidden spir_func void @fe1b( // SPIRV-SAME: i32 noundef [[ORD:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -90,6 +119,42 @@ void fe1b(int ord) { __scoped_atomic_thread_fence(ord, __MEMORY_SCOPE_WRKGRP); } +// AMDGCN-LABEL: define hidden void @fe1c( +// AMDGCN-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[SCOPE_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[SCOPE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCOPE_ADDR]] to ptr +// AMDGCN-NEXT: store i32 [[SCOPE]], ptr [[SCOPE_ADDR_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR_ASCAST]], align 4 +// AMDGCN-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ +// AMDGCN-NEXT: i32 0, label %[[SYSTEM_SCOPE:.*]] +// AMDGCN-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] +// AMDGCN-NEXT: i32 2, label %[[WORKGROUP_SCOPE:.*]] +// AMDGCN-NEXT: i32 5, label %[[CLUSTER_SCOPE:.*]] +// AMDGCN-NEXT: i32 3, label %[[WAVEFRONT_SCOPE:.*]] +// AMDGCN-NEXT: i32 4, label %[[SINGLE_SCOPE:.*]] +// AMDGCN-NEXT: ] +// AMDGCN: [[ATOMIC_SCOPE_CONTINUE]]: +// AMDGCN-NEXT: ret void +// AMDGCN: [[SYSTEM_SCOPE]]: +// AMDGCN-NEXT: fence release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[DEVICE_SCOPE]]: +// AMDGCN-NEXT: fence syncscope("agent") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[WORKGROUP_SCOPE]]: +// AMDGCN-NEXT: fence syncscope("workgroup") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[CLUSTER_SCOPE]]: +// AMDGCN-NEXT: fence syncscope("cluster") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[WAVEFRONT_SCOPE]]: +// AMDGCN-NEXT: fence syncscope("wavefront") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[SINGLE_SCOPE]]: +// AMDGCN-NEXT: 
fence syncscope("singlethread") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// // SPIRV-LABEL: define hidden spir_func void @fe1c( // SPIRV-SAME: i32 noundef [[SCOPE:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -97,23 +162,27 @@ void fe1b(int ord) { // SPIRV-NEXT: store i32 [[SCOPE]], ptr [[SCOPE_ADDR]], align 4 // SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR]], align 4 // SPIRV-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ -// SPIRV-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] // SPIRV-NEXT: i32 0, label %[[SYSTEM_SCOPE:.*]] +// SPIRV-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] // SPIRV-NEXT: i32 2, label %[[WORKGROUP_SCOPE:.*]] +// SPIRV-NEXT: i32 5, label %[[CLUSTER_SCOPE:.*]] // SPIRV-NEXT: i32 3, label %[[WAVEFRONT_SCOPE:.*]] // SPIRV-NEXT: i32 4, label %[[SINGLE_SCOPE:.*]] // SPIRV-NEXT: ] // SPIRV: [[ATOMIC_SCOPE_CONTINUE]]: // SPIRV-NEXT: ret void -// SPIRV: [[DEVICE_SCOPE]]: -// SPIRV-NEXT: fence syncscope("device") release -// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // SPIRV: [[SYSTEM_SCOPE]]: // SPIRV-NEXT: fence release // SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// SPIRV: [[DEVICE_SCOPE]]: +// SPIRV-NEXT: fence syncscope("device") release +// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // SPIRV: [[WORKGROUP_SCOPE]]: // SPIRV-NEXT: fence syncscope("workgroup") release // SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// SPIRV: [[CLUSTER_SCOPE]]: +// SPIRV-NEXT: fence syncscope("workgroup") release +// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // SPIRV: [[WAVEFRONT_SCOPE]]: // SPIRV-NEXT: fence syncscope("subgroup") release // SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] @@ -128,23 +197,27 @@ void fe1b(int ord) { // X86_64-NEXT: store i32 [[SCOPE]], ptr [[SCOPE_ADDR]], align 4 // X86_64-NEXT: [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR]], align 4 // X86_64-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ -// X86_64-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] // X86_64-NEXT: i32 0, label %[[SYSTEM_SCOPE:.*]] +// X86_64-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] // X86_64-NEXT: i32 2, label %[[WORKGROUP_SCOPE:.*]] +// X86_64-NEXT: i32 5, label %[[CLUSTER_SCOPE:.*]] // X86_64-NEXT: i32 3, label %[[WAVEFRONT_SCOPE:.*]] // X86_64-NEXT: i32 4, label %[[SINGLE_SCOPE:.*]] // X86_64-NEXT: ] // X86_64: [[ATOMIC_SCOPE_CONTINUE]]: // X86_64-NEXT: ret void -// X86_64: [[DEVICE_SCOPE]]: +// X86_64: [[SYSTEM_SCOPE]]: // X86_64-NEXT: fence release // X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] -// X86_64: [[SYSTEM_SCOPE]]: +// X86_64: [[DEVICE_SCOPE]]: // X86_64-NEXT: fence release // X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // X86_64: [[WORKGROUP_SCOPE]]: // X86_64-NEXT: fence release // X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// X86_64: [[CLUSTER_SCOPE]]: +// X86_64-NEXT: fence release +// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // X86_64: [[WAVEFRONT_SCOPE]]: // X86_64-NEXT: fence release // X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] diff --git a/clang/test/CodeGen/target-builtin-noerror.c b/clang/test/CodeGen/target-builtin-noerror.c index 855538f27526a..c960193d2e9c9 100644 --- a/clang/test/CodeGen/target-builtin-noerror.c +++ b/clang/test/CodeGen/target-builtin-noerror.c @@ -178,6 +178,8 @@ void verifycpustrings(void) { (void)__builtin_cpu_is("lunarlake"); (void)__builtin_cpu_is("clearwaterforest"); (void)__builtin_cpu_is("pantherlake"); + (void)__builtin_cpu_is("wildcatlake"); + (void)__builtin_cpu_is("novalake"); (void)__builtin_cpu_is("haswell"); 
(void)__builtin_cpu_is("icelake-client"); (void)__builtin_cpu_is("icelake-server"); diff --git a/clang/test/CodeGen/tbaa-class.cpp b/clang/test/CodeGen/tbaa-class.cpp index 0ac59085e634d..4193e10afac8e 100644 --- a/clang/test/CodeGen/tbaa-class.cpp +++ b/clang/test/CodeGen/tbaa-class.cpp @@ -242,16 +242,16 @@ uint32_t g14(StructM2 *M, StructS *S) { return S->f16; } +// CHECK: [[TAG_i32]] = !{[[TYPE_i32:!.*]], [[TYPE_i32]], i64 0} +// CHECK: [[TYPE_i32]] = !{!"int", [[TYPE_char:!.*]], // CHECK: [[TYPE_char:!.*]] = !{!"omnipotent char", [[TAG_cxx_tbaa:!.*]], // CHECK: [[TAG_cxx_tbaa]] = !{!"Simple C++ TBAA"} -// CHECK: [[TAG_i32]] = !{[[TYPE_i32:!.*]], [[TYPE_i32]], i64 0} -// CHECK: [[TYPE_i32]] = !{!"int", [[TYPE_char]], // CHECK: [[TAG_i16]] = !{[[TYPE_i16:!.*]], [[TYPE_i16]], i64 0} // CHECK: [[TYPE_i16]] = !{!"short", [[TYPE_char]], -// OLD-PATH: [[TYPE_CHAR:!.*]] = !{!"omnipotent char", ! // OLD-PATH: [[TAG_i32]] = !{[[TYPE_INT:!.*]], [[TYPE_INT]], i64 0} -// OLD-PATH: [[TYPE_INT]] = !{!"int", [[TYPE_CHAR]] +// OLD-PATH: [[TYPE_INT]] = !{!"int", [[TYPE_CHAR:!.*]] +// OLD-PATH: [[TYPE_CHAR:!.*]] = !{!"omnipotent char", ! // OLD-PATH: [[TAG_A_f32]] = !{[[TYPE_A:!.*]], [[TYPE_INT]], i64 4} // OLD-PATH: [[TYPE_A]] = !{!"_ZTS7StructA", [[TYPE_SHORT:!.*]], i64 0, [[TYPE_INT]], i64 4, [[TYPE_SHORT]], i64 8, [[TYPE_INT]], i64 12} // OLD-PATH: [[TYPE_SHORT:!.*]] = !{!"short", [[TYPE_CHAR]] @@ -277,9 +277,9 @@ uint32_t g14(StructM2 *M, StructS *S) { // OLD-PATH: [[TYPE_M2]] = !{!"_ZTS8StructM2", [[TYPE_DYN:!.*]], i64 0, [[TYPE_S]], i64 12, [[TYPE_SHORT]], i64 20} // OLD_PATH: [[TYPE_DYN]] = !{!"_ZTS9StructDyn", [[TYPE_INT]], i64 8} +// NEW-PATH: [[TAG_i32]] = !{[[TYPE_INT:!.*]], [[TYPE_INT:!.*]], i64 0, i64 4} +// NEW-PATH: [[TYPE_INT]] = !{[[TYPE_CHAR:!.*]], i64 4, !"int"} // NEW-PATH: [[TYPE_CHAR:!.*]] = !{!{{.*}}, i64 1, !"omnipotent char"} -// NEW-PATH: [[TAG_i32]] = !{[[TYPE_INT:!.*]], [[TYPE_INT]], i64 0, i64 4} -// NEW-PATH: [[TYPE_INT]] = !{[[TYPE_CHAR]], i64 4, !"int"} // NEW-PATH: [[TAG_A_f32]] = !{[[TYPE_A:!.*]], [[TYPE_INT]], i64 4, i64 4} // NEW-PATH: [[TYPE_A]] = !{[[TYPE_CHAR]], i64 16, !"_ZTS7StructA", [[TYPE_SHORT:!.*]], i64 0, i64 2, [[TYPE_INT]], i64 4, i64 4, [[TYPE_SHORT]], i64 8, i64 2, [[TYPE_INT]], i64 12, i64 4} // NEW-PATH: [[TYPE_SHORT:!.*]] = !{[[TYPE_CHAR]], i64 2, !"short"} diff --git a/clang/test/CodeGen/tbaa-pointers.c b/clang/test/CodeGen/tbaa-pointers.c index 9cfaa0a47af6e..249cf5634ab11 100644 --- a/clang/test/CodeGen/tbaa-pointers.c +++ b/clang/test/CodeGen/tbaa-pointers.c @@ -1,18 +1,17 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes -no-pointer-tbaa %s -emit-llvm -o - | FileCheck --check-prefixes=COMMON,DISABLE %s // RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck --check-prefixes=COMMON,DEFAULT %s -// RUN: %clang --target=x86_64-apple-darwin -O1 -fno-pointer-tbaa %s -emit-llvm -S -mllvm -disable-llvm-optzns -o - | FileCheck --check-prefixes=COMMON,DISABLE %s -// RUN: %clang --target=x86_64-apple-darwin -O1 %s -emit-llvm -S -mllvm -disable-llvm-optzns -o - | FileCheck --check-prefixes=COMMON,DEFAULT %s void p2unsigned(unsigned **ptr) { // COMMON-LABEL: define void @p2unsigned( // COMMON-SAME: ptr noundef [[PTR:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 - // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P2INT_0:!.+]] - // DEFAULT-NEXT: 
[[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P2INT_0]] - // DEFAULT-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[P1INT_0:!.+]] - // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR:!.+]] - // DISABLE-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]] + // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6:![0-9]+]] + // DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DEFAULT-NEXT: store ptr null, ptr [[TMP0]], align 8, !tbaa [[TBAA10:![0-9]+]] + // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6:![0-9]+]] + // DISABLE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: store ptr null, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] // COMMON-NEXT: ret void // *ptr = 0; @@ -22,12 +21,12 @@ void p2unsigned_volatile(unsigned *volatile *ptr) { // COMMON-LABEL: define void @p2unsigned_volatile( // COMMON-SAME: ptr noundef [[PTR:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 - // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P2INT_0]] - // DEFAULT-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P2INT_0]] - // DEFAULT-NEXT: store volatile ptr null, ptr [[BASE]], align 8, !tbaa [[P1INT_0]] - // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: store volatile ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]] + // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DEFAULT-NEXT: store volatile ptr null, ptr [[TMP0]], align 8, !tbaa [[TBAA10]] + // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: store volatile ptr null, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] // COMMON-NEXT: ret void // *ptr = 0; @@ -37,14 +36,14 @@ void p3int(int ***ptr) { // COMMON-LABEL: define void @p3int( // COMMON-SAME: ptr noundef [[PTR:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 - // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P3INT_0:!.+]] - // DEFAULT-NEXT: [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P3INT_0]] - // DEFAULT-NEXT: [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[P2INT_0]] - // DEFAULT-NEXT: store ptr null, ptr [[BASE_1]], align 8, !tbaa [[P1INT_0]] - // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: store ptr null, ptr [[BASE_1]], align 8, !tbaa [[ANYPTR]] + // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA12:![0-9]+]] + // DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA12]] + // DEFAULT-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] + // DEFAULT-NEXT: store ptr null, ptr [[TMP1]], align 8, !tbaa [[TBAA10]] + // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, 
!tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: store ptr null, ptr [[TMP1]], align 8, !tbaa [[TBAA6]] // COMMON-NEXT: ret void // **ptr = 0; @@ -54,16 +53,16 @@ void p4char(char ****ptr) { // COMMON-LABEL: define void @p4char( // COMMON-SAME: ptr noundef [[PTR:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 - // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P4CHAR_0:!.+]] - // DEFAULT-NEXT: [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P4CHAR_0]] - // DEFAULT-NEXT: [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[P3CHAR_0:!.+]] - // DEFAULT-NEXT: [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[P2CHAR_0:!.+]] - // DEFAULT-NEXT: store ptr null, ptr [[BASE_2]], align 8, !tbaa [[P1CHAR_0:!.+]] - // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: store ptr null, ptr [[BASE_2]], align 8, !tbaa [[ANYPTR]] + // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA15:![0-9]+]] + // DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA15]] + // DEFAULT-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA18:![0-9]+]] + // DEFAULT-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[TBAA20:![0-9]+]] + // DEFAULT-NEXT: store ptr null, ptr [[TMP2]], align 8, !tbaa [[TBAA22:![0-9]+]] + // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: store ptr null, ptr [[TMP2]], align 8, !tbaa [[TBAA6]] // COMMON-NEXT: ret void // ***ptr = 0; @@ -73,16 +72,16 @@ void p4char_const1(const char ****ptr) { // COMMON-LABEL: define void @p4char_const1( // COMMON-SAME: ptr noundef [[PTR:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 - // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P4CHAR_0]] - // DEFAULT-NEXT: [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P4CHAR_0]] - // DEFAULT-NEXT: [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[P3CHAR_0]] - // DEFAULT-NEXT: [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[P2CHAR_0]] - // DEFAULT-NEXT: store ptr null, ptr [[BASE_2]], align 8, !tbaa [[P1CHAR_0]] - // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: store ptr null, ptr [[BASE_2]], align 8, !tbaa [[ANYPTR]] + // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA15]] + // DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA15]] + // DEFAULT-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA18]] + // DEFAULT-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[TBAA20]] + 
// DEFAULT-NEXT: store ptr null, ptr [[TMP2]], align 8, !tbaa [[TBAA22]] + // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: store ptr null, ptr [[TMP2]], align 8, !tbaa [[TBAA6]] // COMMON-NEXT: ret void // ***ptr = 0; @@ -92,16 +91,16 @@ void p4char_const2(const char **const **ptr) { // COMMON-LABEL: define void @p4char_const2( // COMMON-SAME: ptr noundef [[PTR:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 - // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P4CHAR_0]] - // DEFAULT-NEXT: [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P4CHAR_0]] - // DEFAULT-NEXT: [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[P3CHAR_0]] - // DEFAULT-NEXT: [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[P2CHAR_0]] - // DEFAULT-NEXT: store ptr null, ptr [[BASE_2]], align 8, !tbaa [[P1CHAR_0]] - // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE_0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE_1:%.+]] = load ptr, ptr [[BASE_0]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE_2:%.+]] = load ptr, ptr [[BASE_1]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: store ptr null, ptr [[BASE_2]], align 8, !tbaa [[ANYPTR]] + // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA15]] + // DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA15]] + // DEFAULT-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA18]] + // DEFAULT-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[TBAA20]] + // DEFAULT-NEXT: store ptr null, ptr [[TMP2]], align 8, !tbaa [[TBAA22]] + // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: store ptr null, ptr [[TMP2]], align 8, !tbaa [[TBAA6]] // COMMON-NEXT: ret void // ***ptr = 0; @@ -116,12 +115,12 @@ void p2struct(struct S1 **ptr) { // COMMON-LABEL: define void @p2struct( // COMMON-SAME: ptr noundef [[PTR:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 - // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P2S1_TAG:!.+]] - // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DEFAULT-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P2S1_TAG]] - // DISABLE-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DEFAULT-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[P1S1_TAG:!.+]] - // DISABLE-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]] + // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA24:![0-9]+]] + // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA24]] + // DISABLE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DEFAULT-NEXT: store ptr null, ptr [[TMP0]], align 8, !tbaa 
[[TBAA26:![0-9]+]] + // DISABLE-NEXT: store ptr null, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] // COMMON-NEXT: ret void // *ptr = 0; @@ -131,10 +130,12 @@ void p2struct_const(struct S1 const **ptr) { // COMMON-LABEL: define void @p2struct_const( // COMMON-SAME: ptr noundef [[PTR:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 - // COMMON-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR:!.+]] - // COMMON-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DEFAULT-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[P1S1_TAG]] - // DISABLE-NEXT: store ptr null, ptr [[BASE]], align 8, !tbaa [[ANYPTR]] + // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA24]] + // DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA24]] + // DEFAULT-NEXT: store ptr null, ptr [[TMP0]], align 8, !tbaa [[TBAA26]] + // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: store ptr null, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] // COMMON-NEXT: ret void // *ptr = 0; @@ -148,14 +149,14 @@ void p2struct2(struct S2 *ptr) { // COMMON-LABEL: define void @p2struct2( // COMMON-SAME: ptr noundef [[PTR:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 - // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P1S2_TAG:!.+]] - // DEFAULT-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[P1S2_TAG]] - // DEFAULT-NEXT: [[S:%.+]] = getelementptr inbounds nuw %struct.S2, ptr [[BASE]], i32 0, i32 0 - // DEFAULT-NEXT: store ptr null, ptr [[S]], align 8, !tbaa [[S2_S_TAG:!.+]] - // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[BASE:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] - // DISABLE-NEXT: [[S:%.+]] = getelementptr inbounds nuw %struct.S2, ptr [[BASE]], i32 0, i32 0 - // DISABLE-NEXT: store ptr null, ptr [[S]], align 8, !tbaa [[S2_S_TAG:!.+]] + // DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA28:![0-9]+]] + // DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA28]] + // DEFAULT-NEXT: [[S:%.*]] = getelementptr inbounds nuw [[STRUCT_S2:%.*]], ptr [[TMP0]], i32 0, i32 0 + // DEFAULT-NEXT: store ptr null, ptr [[S]], align 8, !tbaa [[TBAA30:![0-9]+]] + // DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] + // DISABLE-NEXT: [[S:%.*]] = getelementptr inbounds nuw [[STRUCT_S2:%.*]], ptr [[TMP0]], i32 0, i32 0 + // DISABLE-NEXT: store ptr null, ptr [[S]], align 8, !tbaa [[TBAA8:![0-9]+]] // COMMON-NEXT: ret void ptr->s = 0; } @@ -167,21 +168,21 @@ void vla1(int n, int ptr[][n], int idx) { // COMMON: [[N_ADDR:%.+]] = alloca i32, align 4 // COMMON-NEXT: [[PTR_ADDR:%.+]] = alloca ptr, align 8 // COMMON-NEXT: [[IDX_ADDR:%.+]] = alloca i32, align 4 -// COMMON-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4, !tbaa [[INT_TY:!.+]] -// DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[P1INT0:!.+]] -// DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] -// COMMON-NEXT: store i32 [[IDX]], ptr [[IDX_ADDR]], align 4, !tbaa [[INT_TY]] -// COMMON-NEXT: [[L:%.+]] = load i32, ptr [[N_ADDR]], align 4, !tbaa [[INT_TY]] -// COMMON-NEXT: [[L_EXT:%.+]] = zext i32 [[L]] to i64 -// DEFAULT-NEXT: [[L_PTR:%.+]] = load ptr, ptr 
[[PTR_ADDR]], align 8, !tbaa [[P1INT0]] -// DISABLE-NEXT: [[L_PTR:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] -// COMMON-NEXT: [[L_IDX:%.+]] = load i32, ptr [[IDX_ADDR]], align 4, !tbaa [[INT_TY]] -// COMMON-NEXT: [[IDX_EXT:%.+]] = sext i32 [[L_IDX]] to i64 -// COMMON-NEXT: [[MUL:%.+]] = mul nsw i64 [[IDX_EXT]], [[L_EXT]] -// COMMON-NEXT: [[GEP1:%.+]] = getelementptr inbounds i32, ptr [[L_PTR]], i64 [[MUL]] -// COMMON-NEXT: [[GEP2:%.+]] = getelementptr inbounds i32, ptr [[GEP1]], i64 0 -// COMMON-NEXT: store i32 0, ptr [[GEP2]], align 4, !tbaa [[INT_TAG:!.+]] -// DEFAULT-NEXT: ret void +// COMMON-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4, !tbaa [[TBAA2:![0-9]+]] +// DEFAULT-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA10]] +// DISABLE-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] +// COMMON-NEXT: store i32 [[IDX]], ptr [[IDX_ADDR]], align 4, !tbaa [[TBAA2]] +// COMMON-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4, !tbaa [[TBAA2]] +// COMMON-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +// DEFAULT-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA10]] +// DISABLE-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] +// COMMON-NEXT: [[TMP3:%.*]] = load i32, ptr [[IDX_ADDR]], align 4, !tbaa [[TBAA2]] +// COMMON-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64 +// COMMON-NEXT: [[TMP4:%.*]] = mul nsw i64 [[IDXPROM]], [[TMP1]] +// COMMON-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP4]] +// COMMON-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i64 0 +// COMMON-NEXT: store i32 0, ptr [[ARRAYIDX1]], align 4, !tbaa [[TBAA2]] +// COMMON-NEXT: ret void ptr[idx][0] = 0; } @@ -194,11 +195,13 @@ void unamed_struct_typedef(TypedefS *ptr) { // COMMON-LABEL: define void @unamed_struct_typedef( // COMMON-SAME: ptr noundef [[PTRA:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 -// DISABLE-NEXT: store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] -// DEFAULT-NEXT: store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR:!.+]] -// COMMON-NEXT: [[L0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] -// COMMON-NEXT: [[GEP:%.+]] = getelementptr inbounds nuw %struct.TypedefS, ptr [[L0]], i32 0, i32 0 -// COMMON-NEXT: store i32 0, ptr [[GEP]], align 4 +// DISABLE-NEXT: store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] +// DEFAULT-NEXT: store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA32:![0-9]+]] +// DISABLE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA32]] +// COMMON-NEXT: [[I1:%.*]] = getelementptr inbounds nuw [[STRUCT_TYPEDEFS:%.*]], ptr [[TMP0]], i32 0, i32 0 +// DISABLE-NEXT: store i32 0, ptr [[I1]], align 4, !tbaa [[TBAA10:![0-9]+]] +// DEFAULT-NEXT: store i32 0, ptr [[I1]], align 4, !tbaa [[TBAA33:![0-9]+]] // COMMON-NEXT: ret void ptr->i1 = 0; @@ -208,52 +211,62 @@ int void_ptrs(void **ptr) { // COMMON-LABEL: define i32 @void_ptrs( // COMMON-SAME: ptr noundef [[PTRA:%.+]]) // COMMON: [[PTR_ADDR:%.+]] = alloca ptr, align 8 -// DISABLE-NEXT: store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] -// DEFAULT-NEXT: store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[ANYP2:!.+]] -// DISABLE-NEXT: [[L0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYPTR]] -// DEFAULT-NEXT: [[L0:%.+]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[ANYP2]] -// COMMON-NEXT: 
[[L1:%.+]] = load ptr, ptr [[L0]], align 8, !tbaa [[ANYPTR]] -// COMMON-NEXT: [[BOOL:%.+]] = icmp ne ptr [[L1]], null -// COMMON-NEXT: [[BOOL_EXT:%.+]] = zext i1 [[BOOL]] to i64 -// COMMON-NEXT: [[COND:%.+]] = select i1 [[BOOL]], i32 0, i32 1 +// DISABLE-NEXT: store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] +// DEFAULT-NEXT: store ptr [[PTRA]], ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA35:![0-9]+]] +// DISABLE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA6]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8, !tbaa [[TBAA35]] +// DISABLE-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA6]] +// DEFAULT-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA32]] +// COMMON-NEXT: [[TOBOOL:%.*]] = icmp ne ptr [[TMP1]], null +// COMMON-NEXT: [[TMP2:%.*]] = zext i1 [[TOBOOL]] to i64 +// COMMON-NEXT: [[COND:%.*]] = select i1 [[TOBOOL]], i32 0, i32 1 // COMMON-NEXT: ret i32 [[COND]] return *ptr ? 0 : 1; } -// DEFAULT: [[P2INT_0]] = !{[[P2INT:!.+]], [[P2INT]], i64 0} -// DEFAULT: [[P2INT]] = !{!"p2 int", [[ANY_P2_POINTER:!.+]], i64 0} -// DEFAULT: [[ANY_P2_POINTER]] = !{!"any p2 pointer", [[ANY_POINTER:!.+]], i64 0} -// DISABLE: [[ANYPTR]] = !{[[ANY_POINTER:!.+]], [[ANY_POINTER]], i64 0} -// COMMON: [[ANY_POINTER]] = !{!"any pointer", [[CHAR:!.+]], i64 0} -// COMMON: [[CHAR]] = !{!"omnipotent char", [[TBAA_ROOT:!.+]], i64 0} -// COMMON: [[TBAA_ROOT]] = !{!"Simple C/C++ TBAA"} -// DEFAULT: [[P1INT_0]] = !{[[P1INT:!.+]], [[P1INT]], i64 0} -// DEFAULT: [[P1INT]] = !{!"p1 int", [[ANY_POINTER]], i64 0} -// DEFAULT: [[P3INT_0]] = !{[[P3INT:!.+]], [[P3INT]], i64 0} -// DEFAULT: [[P3INT]] = !{!"p3 int", [[ANY_P3_POINTER:!.+]], i64 0} -// DEFAULT: [[ANY_P3_POINTER]] = !{!"any p3 pointer", [[ANY_P2_POINTER]], i64 0} -// DEFAULT: [[P4CHAR_0]] = !{[[P4CHAR:!.+]], [[P4CHAR]], i64 0} -// DEFAULT: [[P4CHAR]] = !{!"p4 omnipotent char", [[ANY_P4_POINTER:!.*]], i64 0} -// DEFAULT: [[ANY_P4_POINTER]] = !{!"any p4 pointer", [[ANY_P3_POINTER]], i64 0} -// DEFAULT: [[P3CHAR_0]] = !{[[P3CHAR:!.+]], [[P3CHAR]], i64 0} -// DEFAULT: [[P3CHAR]] = !{!"p3 omnipotent char", [[ANY_P3_POINTER]], i64 0} -// DEFAULT: [[P2CHAR_0]] = !{[[P2CHAR:!.+]], [[P2CHAR]], i64 0} -// DEFAULT: [[P2CHAR]] = !{!"p2 omnipotent char", [[ANY_P2_POINTER]], i64 0} -// DEFAULT: [[P1CHAR_0]] = !{[[P1CHAR:!.+]], [[P1CHAR]], i64 0} -// DEFAULT: [[P1CHAR]] = !{!"p1 omnipotent char", [[ANY_POINTER]], i64 0} -// DEFAULT: [[P2S1_TAG]] = !{[[P2S1:!.+]], [[P2S1]], i64 0} -// DEFAULT: [[P2S1]] = !{!"p2 _ZTS2S1", [[ANY_P2_POINTER]], i64 0} -// DEFAULT: [[P1S1_TAG:!.+]] = !{[[P1S1:!.+]], [[P1S1]], i64 0} -// DEFAULT: [[P1S1]] = !{!"p1 _ZTS2S1", [[ANY_POINTER]], i64 0} -// DEFAULT: [[P1S2_TAG]] = !{[[P1S2:!.+]], [[P1S2]], i64 0} -// DEFAULT: [[P1S2]] = !{!"p1 _ZTS2S2", [[ANY_POINTER]], i64 0} - -// DEFAULT: [[S2_S_TAG]] = !{[[S2_TY:!.+]], [[P1S1]], i64 0} -// DEFAULT: [[S2_TY]] = !{!"S2", [[P1S1]], i64 0} -// DISABLE: [[S2_S_TAG]] = !{[[S2_TY:!.+]], [[ANY_POINTER]], i64 0} -// DISABLE: [[S2_TY]] = !{!"S2", [[ANY_POINTER]], i64 0} -// COMMON: [[INT_TAG]] = !{[[INT_TY:!.+]], [[INT_TY]], i64 0} -// COMMON: [[INT_TY]] = !{!"int", [[CHAR]], i64 0} -// DEFAULT: [[ANYPTR]] = !{[[ANY_POINTER]], [[ANY_POINTER]], i64 0} -// DEFAULT: [[ANYP2]] = !{[[ANY_P2_POINTER]], [[ANY_P2_POINTER]], i64 0} +// DISABLE: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// DISABLE: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// DISABLE: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// 
DISABLE: [[META5]] = !{!"Simple C/C++ TBAA"} +// DISABLE: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// DISABLE: [[META7]] = !{!"any pointer", [[META4]], i64 0} +// DISABLE: [[TBAA8]] = !{[[META9:![0-9]+]], [[META7]], i64 0} +// DISABLE: [[META9]] = !{!"S2", [[META7]], i64 0} +// DISABLE: [[TBAA10]] = !{[[META11:![0-9]+]], [[META3]], i64 0} +// DISABLE: [[META11]] = !{!"", [[META3]], i64 0} +// DEFAULT: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// DEFAULT: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// DEFAULT: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// DEFAULT: [[META5]] = !{!"Simple C/C++ TBAA"} +// DEFAULT: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// DEFAULT: [[META7]] = !{!"p2 int", [[META8:![0-9]+]], i64 0} +// DEFAULT: [[META8]] = !{!"any p2 pointer", [[META9:![0-9]+]], i64 0} +// DEFAULT: [[META9]] = !{!"any pointer", [[META4]], i64 0} +// DEFAULT: [[TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0} +// DEFAULT: [[META11]] = !{!"p1 int", [[META9]], i64 0} +// DEFAULT: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +// DEFAULT: [[META13]] = !{!"p3 int", [[META14:![0-9]+]], i64 0} +// DEFAULT: [[META14]] = !{!"any p3 pointer", [[META8]], i64 0} +// DEFAULT: [[TBAA15]] = !{[[META16:![0-9]+]], [[META16]], i64 0} +// DEFAULT: [[META16]] = !{!"p4 omnipotent char", [[META17:![0-9]+]], i64 0} +// DEFAULT: [[META17]] = !{!"any p4 pointer", [[META14]], i64 0} +// DEFAULT: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} +// DEFAULT: [[META19]] = !{!"p3 omnipotent char", [[META14]], i64 0} +// DEFAULT: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +// DEFAULT: [[META21]] = !{!"p2 omnipotent char", [[META8]], i64 0} +// DEFAULT: [[TBAA22]] = !{[[META23:![0-9]+]], [[META23]], i64 0} +// DEFAULT: [[META23]] = !{!"p1 omnipotent char", [[META9]], i64 0} +// DEFAULT: [[TBAA24]] = !{[[META25:![0-9]+]], [[META25]], i64 0} +// DEFAULT: [[META25]] = !{!"p2 _ZTS2S1", [[META8]], i64 0} +// DEFAULT: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} +// DEFAULT: [[META27]] = !{!"p1 _ZTS2S1", [[META9]], i64 0} +// DEFAULT: [[TBAA28]] = !{[[META29:![0-9]+]], [[META29]], i64 0} +// DEFAULT: [[META29]] = !{!"p1 _ZTS2S2", [[META9]], i64 0} +// DEFAULT: [[TBAA30]] = !{[[META31:![0-9]+]], [[META27]], i64 0} +// DEFAULT: [[META31]] = !{!"S2", [[META27]], i64 0} +// DEFAULT: [[TBAA32]] = !{[[META9]], [[META9]], i64 0} +// DEFAULT: [[TBAA33]] = !{[[META34:![0-9]+]], [[META3]], i64 0} +// DEFAULT: [[META34]] = !{!"", [[META3]], i64 0} +// DEFAULT: [[TBAA35]] = !{[[META8]], [[META8]], i64 0} +//. 
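Taken together, the regenerated tbaa-pointers.c checks above show the two modes side by side: with pointer TBAA enabled (DEFAULT) each pointer depth and pointee type gets its own access tag ("p1 int", "p2 int", "p1 _ZTS2S1", and so on), while -no-pointer-tbaa (DISABLE) collapses them all to "any pointer". A minimal sketch of why the finer tags matter; distinct_pointer_tags is hypothetical and not part of the test:

    /* Under pointer TBAA the two stores carry distinct p1-level tags (an int
       pointer vs. a float pointer), so the reload of *pp can be assumed not
       to alias the store through qq and may fold to null, i.e. return 1.
       Under -no-pointer-tbaa both accesses share the "any pointer" tag and
       the reload must be kept. */
    int distinct_pointer_tags(int **pp, float **qq) {
      *pp = 0;
      *qq = 0;
      return *pp == 0;
    }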
diff --git a/clang/test/CodeGen/tbaa-struct-bitfield-endianness.cpp b/clang/test/CodeGen/tbaa-struct-bitfield-endianness.cpp
index 1177691ca511e..0775f9996f1b2 100644
--- a/clang/test/CodeGen/tbaa-struct-bitfield-endianness.cpp
+++ b/clang/test/CodeGen/tbaa-struct-bitfield-endianness.cpp
@@ -18,18 +18,18 @@ struct NamedBitfields {
 // CHECK-LABEL: _Z4copyP14NamedBitfieldsS0_
 // CHECK-SAME: ptr noundef writeonly captures(none) initializes((0, 16)) [[A1:%.*]], ptr noundef readonly captures(none) [[A2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[A1]], ptr noundef nonnull align 8 dereferenceable(16) [[A2]], i64 16, i1 false), !tbaa.struct [[TBAA_STRUCT2:![0-9]+]]
+// CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[A1]], ptr noundef nonnull align 8 dereferenceable(16) [[A2]], i64 16, i1 false), !tbaa.struct [[TBAA_STRUCT6:![0-9]+]]
 // CHECK-NEXT: ret void
 //
 void copy(NamedBitfields *a1, NamedBitfields *a2) {
 *a1 = *a2;
 }
 
-// CHECK: [[TBAA_STRUCT2]] = !{i64 0, i64 4, [[META3:![0-9]+]], i64 4, i64 4, [[META6:![0-9]+]], i64 8, i64 8, [[META8:![0-9]+]]}
-// CHECK: [[META3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+// CHECK: [[META2:![0-9]+]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
 // CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
 // CHECK: [[META5]] = !{!"Simple C++ TBAA"}
-// CHECK: [[META6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
-// CHECK: [[META7]] = !{!"int", [[META4]], i64 0}
+// CHECK: [[TBAA_STRUCT6]] = !{i64 0, i64 4, [[META7:![0-9]+]], i64 4, i64 4, [[META2:![0-9]+]], i64 8, i64 8, [[META8:![0-9]+]]}
+// CHECK: [[META7]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
 // CHECK: [[META8]] = !{[[META9:![0-9]+]], [[META9]], i64 0}
 // CHECK: [[META9]] = !{!"double", [[META4]], i64 0}
diff --git a/clang/test/CodeGen/tbaa-struct.cpp b/clang/test/CodeGen/tbaa-struct.cpp
index ca076ce5aa273..48f84928d93a5 100644
--- a/clang/test/CodeGen/tbaa-struct.cpp
+++ b/clang/test/CodeGen/tbaa-struct.cpp
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - -O1 %s | \
 // RUN: FileCheck -check-prefixes=CHECK,CHECK-OLD %s
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -new-struct-path-tbaa \
@@ -18,9 +19,9 @@ typedef A __attribute__((may_alias)) AA;
 void copy(A *a1, A *a2) {
 // CHECK-LABEL: _Z4copyP1AS0_
 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) %{{.*}}, ptr noundef nonnull align 4 dereferenceable(16) %{{.*}}, i64 16, i1 false)
-// CHECK-OLD-SAME: !tbaa.struct [[TS:!.*]]
-// CHECK-NEW-SAME: !tbaa [[TAG_A:![0-9]*]]
- *a1 = *a2;
+// CHECK-OLD-SAME: !tbaa.struct [[TBAA_STRUCT6:![0-9]+]]
+// CHECK-NEW-SAME: !tbaa [[TBAA6:![0-9]+]]
+*a1 = *a2;
 }
 
 struct B {
@@ -30,10 +31,10 @@ };
 
 void copy2(B *b1, B *b2) {
-// CHECK-LABEL: _Z5copy2P1BS0_
+// CHECK-LABEL: _Z5copy2P1BS0_
 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) %{{.*}}, ptr noundef nonnull align 4 dereferenceable(24) %{{.*}}, i64 24, i1 false)
-// CHECK-OLD-SAME: !tbaa.struct [[TS2:!.*]]
-// CHECK-NEW-SAME: !tbaa [[TAG_B:![0-9]*]]
+// CHECK-OLD-SAME: !tbaa.struct [[TBAA_STRUCT10:![0-9]+]]
+// CHECK-NEW-SAME: !tbaa [[TBAA12:![0-9]+]]
 *b1 = *b2;
 }
@@ -50,9 +51,9 @@ union U {
 void copy3(U *u1, U *u2) {
 // CHECK-LABEL: _Z5copy3P1US0_
 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(12) %{{.*}}, ptr noundef nonnull align 4 dereferenceable(12) %{{.*}}, i64 12, i1 false)
-// CHECK-OLD-SAME: !tbaa.struct [[TS3:!.*]]
-// CHECK-NEW-SAME: !tbaa [[TAG_U:![0-9]*]]
- *u1 = *u2;
+// CHECK-OLD-SAME: !tbaa.struct [[TBAA_STRUCT11:![0-9]+]]
+// CHECK-NEW-SAME: !tbaa [[TBAA15:![0-9]+]]
+*u1 = *u2;
 }
 
 // Make sure that zero-length bitfield works.
@@ -66,8 +67,8 @@ struct C {
 void copy4(C *c1, C *c2) {
 // CHECK-LABEL: _Z5copy4P1CS0_
 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) {{.*}}, ptr noundef nonnull align 1 dereferenceable(3) {{.*}}, i64 3, i1 false)
-// CHECK-OLD-SAME: !tbaa.struct [[TS4:!.*]]
-// CHECK-NEW-SAME: !tbaa [[TAG_C:![0-9]*]]
+// CHECK-OLD-SAME: !tbaa.struct [[TBAA_STRUCT12:![0-9]+]]
+// CHECK-NEW-SAME: !tbaa [[TBAA17:![0-9]+]]
 *c1 = *c2;
 }
@@ -81,24 +82,24 @@ struct D {
 void copy5(D *d1, D *d2) {
 // CHECK-LABEL: _Z5copy5P1DS0_
 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(6) {{.*}}, ptr noundef nonnull align 1 dereferenceable(6) {{.*}}, i64 6, i1 false)
-// CHECK-OLD-SAME: !tbaa.struct [[TS5:!.*]]
-// CHECK-NEW-SAME: !tbaa [[TAG_D:![0-9]*]]
+// CHECK-OLD-SAME: !tbaa.struct [[TBAA_STRUCT13:![0-9]+]]
+// CHECK-NEW-SAME: !tbaa [[TBAA20:![0-9]+]]
 *d1 = *d2;
 }
 
 void copy6(AA *a1, A *a2) {
 // CHECK-LABEL: _Z5copy6P1AS0_
 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) %{{.*}}, ptr noundef nonnull align 4 dereferenceable(16) %{{.*}}, i64 16, i1 false)
-// CHECK-OLD-SAME: !tbaa.struct [[TS]]
-// CHECK-NEW-SAME: !tbaa [[TAG_char:![0-9]*]]
+// CHECK-OLD-SAME: !tbaa.struct [[TBAA_STRUCT6]]
+// CHECK-NEW-SAME: !tbaa [[TBAA23:![0-9]+]]
 *a1 = *a2;
 }
 
 void copy7(A *a1, AA *a2) {
 // CHECK-LABEL: _Z5copy7P1AS0_
 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) %{{.*}}, ptr noundef nonnull align 4 dereferenceable(16) %{{.*}}, i64 16, i1 false)
-// CHECK-OLD-SAME: !tbaa.struct [[TS]]
-// CHECK-NEW-SAME: !tbaa [[TAG_char]]
+// CHECK-OLD-SAME: !tbaa.struct [[TBAA_STRUCT6]]
+// CHECK-NEW-SAME: !tbaa [[TBAA23]]
 *a1 = *a2;
 }
@@ -112,8 +113,8 @@ struct NamedBitfields {
 void copy8(NamedBitfields *a1, NamedBitfields *a2) {
 // CHECK-LABEL: _Z5copy8P14NamedBitfieldsS0_
 // CHECK: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %a1, ptr noundef nonnull align 8 dereferenceable(16) %a2, i64 16, i1 false),
-// CHECK-OLD-SAME: !tbaa.struct [[TS6:!.*]]
-// CHECK-NEW-SAME: !tbaa [[TAG_NamedBitfields:!.+]], !tbaa.struct
+// CHECK-OLD-SAME: !tbaa.struct [[TBAA_STRUCT14:![0-9]+]]
+// CHECK-NEW-SAME: !tbaa [[TBAA24:![0-9]+]], !tbaa.struct
 *a1 = *a2;
 }
@@ -129,8 +130,8 @@ struct NamedBitfields2 {
 void copy9(NamedBitfields2 *a1, NamedBitfields2 *a2) {
 // CHECK-LABEL: _Z5copy9P15NamedBitfields2S0_
 // CHECK: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) %a1, ptr noundef nonnull align 8 dereferenceable(24) %a2, i64 24, i1 false),
-// CHECK-OLD-SAME: !tbaa.struct [[TS7:!.*]]
-// CHECK-NEW-SAME: !tbaa [[TAG_NamedBitfields2:!.+]], !tbaa.struct
+// CHECK-OLD-SAME: !tbaa.struct [[TBAA_STRUCT17:![0-9]+]]
+// CHECK-NEW-SAME: !tbaa [[TBAA30:![0-9]+]], !tbaa.struct
 *a1 = *a2;
 }
@@ -146,8 +147,8 @@ struct NamedBitfields3 {
 void copy10(NamedBitfields3 *a1, NamedBitfields3 *a2) {
 // CHECK-LABEL: _Z6copy10P15NamedBitfields3S0_
 // CHECK: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %a1, ptr noundef nonnull align 8 dereferenceable(16) %a2, i64 16, i1 false),
-// CHECK-OLD-SAME: !tbaa.struct [[TS8:!.*]]
-// CHECK-NEW-SAME: !tbaa [[TAG_NamedBitfields3:!.+]], !tbaa.struct
+// CHECK-OLD-SAME: !tbaa.struct [[TBAA_STRUCT18:![0-9]+]]
+// CHECK-NEW-SAME: !tbaa [[TBAA33:![0-9]+]], !tbaa.struct
 *a1 = *a2;
 }
@@ -164,8 +165,8 @@ struct UnionMember1 {
 void copy11(UnionMember1 *a1, UnionMember1 *a2) {
 // CHECK-LABEL: _Z6copy11P12UnionMember1S0_
 // CHECK: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %a1, ptr noundef nonnull align 8 dereferenceable(16) %a2, i64 16, i1 false),
-// CHECK-OLD-SAME: !tbaa.struct [[TS9:!.*]]
-// CHECK-NEW-SAME: !tbaa [[TAG_UnionMember1:!.+]], !tbaa.struct
+// CHECK-OLD-SAME: !tbaa.struct [[TBAA_STRUCT19:![0-9]+]]
+// CHECK-NEW-SAME: !tbaa [[TBAA37:![0-9]+]], !tbaa.struct
 *a1 = *a2;
 }
@@ -177,52 +178,61 @@ struct UnionMember2 {
 void copy12(UnionMember2 *a1, UnionMember2 *a2) {
 // CHECK-LABEL: _Z6copy12P12UnionMember2S0_
 // CHECK: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %a1, ptr noundef nonnull align 8 dereferenceable(16) %a2, i64 16, i1 false),
-// CHECK-OLD-SAME: !tbaa.struct [[TS10:!.*]]
-// CHECK-NEW-SAME: !tbaa [[TAG_UnionMember2:!.+]], !tbaa.struct
+// CHECK-OLD-SAME: !tbaa.struct [[TBAA_STRUCT20:![0-9]+]]
+// CHECK-NEW-SAME: !tbaa [[TBAA41:![0-9]+]], !tbaa.struct
 *a1 = *a2;
 }
 
-// CHECK-OLD: [[TS]] = !{i64 0, i64 2, !{{.*}}, i64 4, i64 4, !{{.*}}, i64 8, i64 1, !{{.*}}, i64 12, i64 4, !{{.*}}}
-// CHECK-OLD: [[CHAR:!.*]] = !{!"omnipotent char", !{{.*}}}
-// CHECK-OLD: [[TAG_INT:!.*]] = !{[[INT:!.*]], [[INT]], i64 0}
-// CHECK-OLD: [[INT]] = !{!"int", [[CHAR]]
-// CHECK-OLD: [[TAG_CHAR:!.*]] = !{[[CHAR]], [[CHAR]], i64 0}
 // (offset, size) = (0,1) char; (4,2) short; (8,4) int; (12,1) char; (16,4) int; (20,4) int
-// CHECK-OLD: [[TS2]] = !{i64 0, i64 1, !{{.*}}, i64 4, i64 2, !{{.*}}, i64 8, i64 4, !{{.*}}, i64 12, i64 1, !{{.*}}, i64 16, i64 4, {{.*}}, i64 20, i64 4, {{.*}}}
 // (offset, size) = (0,8) char; (0,2) char; (4,8) char
-// CHECK-OLD: [[TS3]] = !{i64 0, i64 12, [[TAG_CHAR]]}
-// CHECK-OLD: [[TS4]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]]}
-// CHECK-OLD: [[TS5]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 4, i64 1, [[TAG_CHAR]], i64 5, i64 1, [[TAG_CHAR]]}
-// CHECK-OLD: [[TS6]] = !{i64 0, i64 2, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE:!.+]]}
-// CHECK-OLD: [[TAG_DOUBLE]] = !{[[DOUBLE:!.+]], [[DOUBLE]], i64 0}
 // CHECK-OLD [[DOUBLE]] = !{!"double", [[CHAR]], i64 0}
-// CHECK-OLD: [[TS7]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 3, i64 2, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]], i64 16, i64 1, [[TAG_CHAR]]}
-// CHECK-OLD: [[TS8]] = !{i64 0, i64 4, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]]}
-// CHECK-OLD: [[TS9]] = !{i64 0, i64 8, [[TAG_CHAR]], i64 8, i64 4, [[TAG_INT]]}
-// CHECK-OLD: [[TS10]] = !{i64 0, i64 4, [[TAG_INT]], i64 8, i64 8, [[TAG_CHAR]]}
-
-// CHECK-NEW-DAG: [[TYPE_char:!.*]] = !{{{.*}}, i64 1, !"omnipotent char"}
-// CHECK-NEW-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 0}
-// CHECK-NEW-DAG: [[TYPE_short:!.*]] = !{[[TYPE_char]], i64 2, !"short"}
-// CHECK-NEW-DAG: [[TYPE_int:!.*]] = !{[[TYPE_char]], i64 4, !"int"}
-// CHECK-NEW-DAG: [[TYPE_A:!.*]] = !{[[TYPE_char]], i64 16, !"_ZTS1A", [[TYPE_short]], i64 0, i64 2, [[TYPE_int]], i64 4, i64 4, [[TYPE_char]], i64 8, i64 1, [[TYPE_int]], i64 12, i64 4}
-// CHECK-NEW-DAG: [[TAG_A]] = !{[[TYPE_A]], [[TYPE_A]], i64 0, i64 16}
-// CHECK-NEW-DAG: [[TYPE_B:!.*]] = !{[[TYPE_char]], i64 24, !"_ZTS1B", [[TYPE_char]], i64 0, i64 1, [[TYPE_A]], i64 4, i64 16, [[TYPE_int]], i64 20, i64 4}
-// CHECK-NEW-DAG: [[TAG_B]] = !{[[TYPE_B]], [[TYPE_B]], i64 0, i64 24}
-// CHECK-NEW-DAG: [[TAG_U]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 12}
-// CHECK-NEW-DAG: [[TYPE_C:!.*]] = !{[[TYPE_char]], i64 3, !"_ZTS1C", [[TYPE_char]], i64 0, i64 1, [[TYPE_char]], i64 1, i64 1, [[TYPE_char]], i64 2, i64 1}
-// CHECK-NEW-DAG: [[TAG_C]] = !{[[TYPE_C]], [[TYPE_C]], i64 0, i64 3}
-// CHECK-NEW-DAG: [[TYPE_D:!.*]] = !{[[TYPE_char]], i64 6, !"_ZTS1D", [[TYPE_char]], i64 0, i64 1, [[TYPE_char]], i64 4, i64 1, [[TYPE_char]], i64 5, i64 1}
-// CHECK-NEW-DAG: [[TAG_D]] = !{[[TYPE_D]], [[TYPE_D]], i64 0, i64 6}
-// CHECK-NEW-DAG: [[TAG_NamedBitfields]] = !{[[TYPE_NamedBitfields:!.+]], [[TYPE_NamedBitfields]], i64 0, i64 16}
-// CHECK-NEW-DAG: [[TYPE_NamedBitfields]] = !{[[TYPE_char]], i64 16, !"_ZTS14NamedBitfields", [[TYPE_int]], i64 0, i64 4, [[TYPE_int]], i64 1, i64 4, [[TYPE_char]], i64 2, i64 1, [[TYPE_double:!.+]], i64 8, i64 8}
-// CHECK-NEW-DAG: [[TYPE_double]] = !{[[TYPE_char]], i64 8, !"double"}
-// CHECK-NEW-DAG: [[TAG_NamedBitfields2]] = !{[[TYPE_NamedBitfields2:!.+]], [[TYPE_NamedBitfields2]], i64 0, i64 24}
-// CHECK-NEW-DAG: [[TYPE_NamedBitfields2]] = !{[[TYPE_char]], i64 24, !"_ZTS15NamedBitfields2", [[TYPE_char]], i64 0, i64 1, [[TYPE_char]], i64 1, i64 1, [[TYPE_char]], i64 2, i64 1, [[TYPE_int]], i64 3, i64 4, [[TYPE_int]], i64 3, i64 4, [[TYPE_char]], i64 4, i64 1, [[TYPE_double]], i64 8, i64 8, [[TYPE_int]], i64 16, i64 4}
-// CHECK-NEW-DAG: [[TAG_NamedBitfields3]] = !{[[TYPE_NamedBitfields3:!.+]], [[TYPE_NamedBitfields3]], i64 0, i64 16}
-// CHECK-NEW-DAG: [[TYPE_NamedBitfields3]] = !{[[TYPE_char]], i64 16, !"_ZTS15NamedBitfields3", [[TYPE_int]], i64 1, i64 4, [[TYPE_int]], i64 2, i64 4, [[TYPE_double]], i64 8, i64 8}
-// CHECK-NEW-DAG: [[TAG_UnionMember1]] = !{[[TYPE_UnionMember1:!.+]], [[TYPE_UnionMember1]], i64 0, i64 16}
-// CHECK-NEW-DAG: [[TYPE_UnionMember1]] = !{[[TYPE_char]], i64 16, !"_ZTS12UnionMember1", [[TYPE_char]], i64 0, i64 8, [[TYPE_int]], i64 8, i64 4}
-// CHECK-NEW-DAG: [[TAG_UnionMember2]] = !{[[TYPE_UnionMember2:!.+]], [[TYPE_UnionMember2]], i64 0, i64 16}
-// CHECK-NEW-DAG: [[TYPE_UnionMember2]] = !{[[TYPE_char]], i64 16, !"_ZTS12UnionMember2", [[TYPE_int]], i64 0, i64 4, [[TYPE_char]], i64 8, i64 8}
+
+//.
+// CHECK-OLD: [[META2:![0-9]+]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// CHECK-OLD: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// CHECK-OLD: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK-OLD: [[META5]] = !{!"Simple C++ TBAA"}
+// CHECK-OLD: [[TBAA_STRUCT6]] = !{i64 0, i64 2, [[META7:![0-9]+]], i64 4, i64 4, [[META2]], i64 8, i64 1, [[META9:![0-9]+]], i64 12, i64 4, [[META2]]}
+// CHECK-OLD: [[META7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
+// CHECK-OLD: [[META8]] = !{!"short", [[META4]], i64 0}
+// CHECK-OLD: [[META9]] = !{[[META4]], [[META4]], i64 0}
+// CHECK-OLD: [[TBAA_STRUCT10]] = !{i64 0, i64 1, [[META9]], i64 4, i64 2, [[META7]], i64 8, i64 4, [[META2]], i64 12, i64 1, [[META9]], i64 16, i64 4, [[META2]], i64 20, i64 4, [[META2]]}
+// CHECK-OLD: [[TBAA_STRUCT11]] = !{i64 0, i64 12, [[META9]]}
+// CHECK-OLD: [[TBAA_STRUCT12]] = !{i64 0, i64 1, [[META9]], i64 1, i64 1, [[META9]], i64 2, i64 1, [[META9]]}
+// CHECK-OLD: [[TBAA_STRUCT13]] = !{i64 0, i64 1, [[META9]], i64 4, i64 1, [[META9]], i64 5, i64 1, [[META9]]}
+// CHECK-OLD: [[TBAA_STRUCT14]] = !{i64 0, i64 2, [[META9]], i64 2, i64 1, [[META9]], i64 8, i64 8, [[META15:![0-9]+]]}
+// CHECK-OLD: [[META15]] = !{[[META16:![0-9]+]], [[META16]], i64 0}
+// CHECK-OLD: [[META16]] = !{!"double", [[META4]], i64 0}
+// CHECK-OLD: [[TBAA_STRUCT17]] = !{i64 0, i64 1, [[META9]], i64 1, i64 1, [[META9]], i64 2, i64 1, [[META9]], i64 3, i64 2, [[META9]], i64 8, i64 8, [[META15]], i64 16, i64 1, [[META9]]}
+// CHECK-OLD: [[TBAA_STRUCT18]] = !{i64 0, i64 4, [[META9]], i64 8, i64 8, [[META15]]}
+// CHECK-OLD: [[TBAA_STRUCT19]] = !{i64 0, i64 8, [[META9]], i64 8, i64 4, [[META2]]}
+// CHECK-OLD: [[TBAA_STRUCT20]] = !{i64 0, i64 4, [[META2]], i64 8, i64 8, [[META9]]}
+//.
+// CHECK-NEW: [[META2:![0-9]+]] = !{[[META3:![0-9]+]], [[META3]], i64 0, i64 4}
+// CHECK-NEW: [[META3]] = !{[[META4:![0-9]+]], i64 4, !"int"}
+// CHECK-NEW: [[META4]] = !{[[META5:![0-9]+]], i64 1, !"omnipotent char"}
+// CHECK-NEW: [[META5]] = !{!"Simple C++ TBAA"}
+// CHECK-NEW: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0, i64 16}
+// CHECK-NEW: [[META7]] = !{[[META4]], i64 16, !"_ZTS1A", [[META8:![0-9]+]], i64 0, i64 2, [[META3]], i64 4, i64 4, [[META4]], i64 8, i64 1, [[META3]], i64 12, i64 4}
+// CHECK-NEW: [[META8]] = !{[[META4]], i64 2, !"short"}
+// CHECK-NEW: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0, i64 24}
+// CHECK-NEW: [[META13]] = !{[[META4]], i64 24, !"_ZTS1B", [[META4]], i64 0, i64 1, [[META7]], i64 4, i64 16, [[META3]], i64 20, i64 4}
+// CHECK-NEW: [[TBAA15]] = !{[[META4]], [[META4]], i64 0, i64 12}
+// CHECK-NEW: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0, i64 3}
+// CHECK-NEW: [[META18]] = !{[[META4]], i64 3, !"_ZTS1C", [[META4]], i64 0, i64 1, [[META4]], i64 1, i64 1, [[META4]], i64 2, i64 1}
+// CHECK-NEW: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0, i64 6}
+// CHECK-NEW: [[META21]] = !{[[META4]], i64 6, !"_ZTS1D", [[META4]], i64 0, i64 1, [[META4]], i64 4, i64 1, [[META4]], i64 5, i64 1}
+// CHECK-NEW: [[TBAA23]] = !{[[META4]], [[META4]], i64 0, i64 0}
+// CHECK-NEW: [[TBAA24]] = !{[[META25:![0-9]+]], [[META25]], i64 0, i64 16}
+// CHECK-NEW: [[META25]] = !{[[META4]], i64 16, !"_ZTS14NamedBitfields", [[META3]], i64 0, i64 4, [[META3]], i64 1, i64 4, [[META4]], i64 2, i64 1, [[META26:![0-9]+]], i64 8, i64 8}
+// CHECK-NEW: [[META26]] = !{[[META4]], i64 8, !"double"}
+// CHECK-NEW: [[TBAA30]] = !{[[META31:![0-9]+]], [[META31]], i64 0, i64 24}
+// CHECK-NEW: [[META31]] = !{[[META4]], i64 24, !"_ZTS15NamedBitfields2", [[META4]], i64 0, i64 1, [[META4]], i64 1, i64 1, [[META4]], i64 2, i64 1, [[META3]], i64 3, i64 4, [[META3]], i64 3, i64 4, [[META4]], i64 4, i64 1, [[META26]], i64 8, i64 8, [[META3]], i64 16, i64 4}
+// CHECK-NEW: [[TBAA33]] = !{[[META34:![0-9]+]], [[META34]], i64 0, i64 16}
+// CHECK-NEW: [[META34]] = !{[[META4]], i64 16, !"_ZTS15NamedBitfields3", [[META3]], i64 1, i64 4, [[META3]], i64 2, i64 4, [[META26]], i64 8, i64 8}
+// CHECK-NEW: [[TBAA37]] = !{[[META38:![0-9]+]], [[META38]], i64 0, i64 16}
+// CHECK-NEW: [[META38]] = !{[[META4]], i64 16, !"_ZTS12UnionMember1", [[META4]], i64 0, i64 8, [[META3]], i64 8, i64 4}
+// CHECK-NEW: [[TBAA41]] = !{[[META42:![0-9]+]], [[META42]], i64 0, i64 16}
+// CHECK-NEW: [[META42]] = !{[[META4]], i64 16, !"_ZTS12UnionMember2", [[META3]], i64 0, i64 4, [[META4]], i64 8, i64 8}
+//.
diff --git a/clang/test/CodeGen/tbaa.c b/clang/test/CodeGen/tbaa.c
index 0ab81f60a7194..a719c0398e79b 100644
--- a/clang/test/CodeGen/tbaa.c
+++ b/clang/test/CodeGen/tbaa.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefixes=PATH
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
@@ -37,13 +38,13 @@ typedef enum : uint8_t {
 uint32_t g0(EnumAuto32 *E, uint32_t *val) {
 // CHECK-LABEL: define{{.*}} i32 @g0(
-// CHECK: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
-// CHECK: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TBAA2:![0-9]+]]
+// CHECK: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
 // PATH-LABEL: define{{.*}} i32 @g0(
-// PATH: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
-// PATH: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// PATH: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// PATH: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TBAA2:![0-9]+]]
+// PATH: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// PATH: load i32, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
 *val = 5;
 *E = RED_AUTO_32;
 return *val;
@@ -51,13 +52,13 @@ uint32_t g0(EnumAuto32 *E, uint32_t *val) {
 uint64_t g1(EnumAuto64 *E, uint64_t *val) {
 // CHECK-LABEL: define{{.*}} i64 @g1(
-// CHECK: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
-// CHECK: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
-// CHECK: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+// CHECK: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TBAA12:![0-9]+]]
+// CHECK: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TBAA14:![0-9]+]]
+// CHECK: load i64, ptr %{{.*}}, align 8, !tbaa [[TBAA12]]
 // PATH-LABEL: define{{.*}} i64 @g1(
-// PATH: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]]
-// PATH: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]]
-// PATH: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]]
+// PATH: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TBAA12:![0-9]+]]
+// PATH: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TBAA14:![0-9]+]]
+// PATH: load i64, ptr %{{.*}}, align 8, !tbaa [[TBAA12]]
 *val = 5;
 *E = RED_AUTO_64;
 return *val;
@@ -65,13 +66,13 @@ uint64_t g1(EnumAuto64 *E, uint64_t *val) {
 uint16_t g2(Enum16 *E, uint16_t *val) {
 // CHECK-LABEL: define{{.*}} i16 @g2(
-// CHECK: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TAG_i16:!.*]]
-// CHECK: store i16 0, ptr %{{.*}}, align 2, !tbaa [[TAG_i16]]
-// CHECK: load i16, ptr %{{.*}}, align 2, !tbaa [[TAG_i16]]
+// CHECK: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TBAA18:![0-9]+]]
+// CHECK: store i16 0, ptr %{{.*}}, align 2, !tbaa [[TBAA18]]
+// CHECK: load i16, ptr %{{.*}}, align 2, !tbaa [[TBAA18]]
 // PATH-LABEL: define{{.*}} i16 @g2(
-// PATH: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TAG_i16:!.*]]
-// PATH: store i16 0, ptr %{{.*}}, align 2, !tbaa [[TAG_i16]]
-// PATH: load i16, ptr %{{.*}}, align 2, !tbaa [[TAG_i16]]
+// PATH: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TBAA18:![0-9]+]]
+// PATH: store i16 0, ptr %{{.*}}, align 2, !tbaa [[TBAA18]]
+// PATH: load i16, ptr %{{.*}}, align 2, !tbaa [[TBAA18]]
 *val = 5;
 *E = RED_16;
 return *val;
@@ -79,38 +80,40 @@ uint16_t g2(Enum16 *E, uint16_t *val) {
 uint8_t g3(Enum8 *E, uint8_t *val) {
 // CHECK-LABEL: define{{.*}} i8 @g3(
-// CHECK: store i8 5, ptr %{{.*}}, align 1, !tbaa [[TAG_i8:!.*]]
-// CHECK: store i8 0, ptr %{{.*}}, align 1, !tbaa [[TAG_i8]]
-// CHECK: load i8, ptr %{{.*}}, align 1, !tbaa [[TAG_i8]]
+// CHECK: store i8 5, ptr %{{.*}}, align 1, !tbaa [[TBAA22:![0-9]+]]
+// CHECK: store i8 0, ptr %{{.*}}, align 1, !tbaa [[TBAA22]]
+// CHECK: load i8, ptr %{{.*}}, align 1, !tbaa [[TBAA22]]
 // PATH-LABEL: define{{.*}} i8 @g3(
-// PATH: store i8 5, ptr %{{.*}}, align 1, !tbaa [[TAG_i8:!.*]]
-// PATH: store i8 0, ptr %{{.*}}, align 1, !tbaa [[TAG_i8]]
-// PATH: load i8, ptr %{{.*}}, align 1, !tbaa [[TAG_i8]]
+// PATH: store i8 5, ptr %{{.*}}, align 1, !tbaa [[TBAA22:![0-9]+]]
+// PATH: store i8 0, ptr %{{.*}}, align 1, !tbaa [[TBAA22]]
+// PATH: load i8, ptr %{{.*}}, align 1, !tbaa [[TBAA22]]
 *val = 5;
 *E = RED_8;
 return *val;
 }
 
-// CHECK: [[TYPE_char:!.*]] = !{!"omnipotent char", [[TAG_c_tbaa:!.*]],
-// CHECK: [[TAG_c_tbaa]] = !{!"Simple C/C++ TBAA"}
-// CHECK: [[TAG_i32]] = !{[[TYPE_i32:!.*]], [[TYPE_i32]], i64 0}
-// CHECK: [[TYPE_i32]] = !{!"int", [[TYPE_char]],
-// CHECK: [[TAG_i64]] = !{[[TYPE_i64:!.*]], [[TYPE_i64]], i64 0}
-// CHECK: [[TYPE_i64]] = !{!"long long", [[TYPE_char]],
-// CHECK: [[TAG_long]] = !{[[TYPE_long:!.*]], [[TYPE_long]], i64 0}
-// CHECK: [[TYPE_long]] = !{!"long", [[TYPE_char]],
-// CHECK: [[TAG_i16]] = !{[[TYPE_i16:!.*]], [[TYPE_i16]], i64 0}
-// CHECK: [[TYPE_i16]] = !{!"short", [[TYPE_char]],
-// CHECK: [[TAG_i8]] = !{[[TYPE_i8:!.*]], [[TYPE_char]], i64 0}
-
-// PATH: [[TYPE_char:!.*]] = !{!"omnipotent char", [[TAG_c_tbaa:!.*]],
-// PATH: [[TAG_c_tbaa]] = !{!"Simple C/C++ TBAA"}
-// PATH: [[TAG_i32]] = !{[[TYPE_i32:!.*]], [[TYPE_i32]], i64 0}
-// PATH: [[TYPE_i32]] = !{!"int", [[TYPE_char]],
-// PATH: [[TAG_i64]] = !{[[TYPE_i64:!.*]], [[TYPE_i64]], i64 0}
-// PATH: [[TYPE_i64]] = !{!"long long", [[TYPE_char]],
-// PATH: [[TAG_long]] = !{[[TYPE_long:!.*]], [[TYPE_long]], i64 0}
-// PATH: [[TYPE_long]] = !{!"long", [[TYPE_char]],
-// PATH: [[TAG_i16]] = !{[[TYPE_i16:!.*]], [[TYPE_i16]], i64 0}
-// PATH: [[TYPE_i16]] = !{!"short", [[TYPE_char]],
-// PATH: [[TAG_i8]] = !{[[TYPE_i8:!.*]], [[TYPE_char]], i64 0}
+//.
+// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
+// CHECK: [[META13]] = !{!"long long", [[META4]], i64 0}
+// CHECK: [[TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0}
+// CHECK: [[META15]] = !{!"long", [[META4]], i64 0}
+// CHECK: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
+// CHECK: [[META19]] = !{!"short", [[META4]], i64 0}
+// CHECK: [[TBAA22]] = !{[[META4]], [[META4]], i64 0}
+//.
+// PATH: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// PATH: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// PATH: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// PATH: [[META5]] = !{!"Simple C/C++ TBAA"}
+// PATH: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
+// PATH: [[META13]] = !{!"long long", [[META4]], i64 0}
+// PATH: [[TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0}
+// PATH: [[META15]] = !{!"long", [[META4]], i64 0}
+// PATH: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0}
+// PATH: [[META19]] = !{!"short", [[META4]], i64 0}
+// PATH: [[TBAA22]] = !{[[META4]], [[META4]], i64 0}
+//.
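
A note on what tbaa.c is checking, since the rewrite above only renames the FileCheck captures: for an enum with a fixed underlying type, Clang tags accesses with the TBAA node of the underlying integer, so an Enum16 store and a uint16_t store share the !"short" tag ([[TBAA18]] above), while uint8_t accesses fall back to the "omnipotent char" node ([[TBAA22]]). A minimal C++ sketch of the same idea (names hypothetical):

#include <cstdint>

enum Enum16Sketch : uint16_t { RED_16_SKETCH };

uint16_t g2_sketch(Enum16Sketch *E, uint16_t *val) {
  *val = 5;            // i16 store, !tbaa <short tag>
  *E = RED_16_SKETCH;  // same !tbaa tag: the enum aliases its underlying type
  return *val;         // so this load cannot be assumed independent of *E
}
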
diff --git a/clang/test/CodeGen/tbaa.cpp b/clang/test/CodeGen/tbaa.cpp
index 3e92d1ea3df95..29c0c58432e06 100644
--- a/clang/test/CodeGen/tbaa.cpp
+++ b/clang/test/CodeGen/tbaa.cpp
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefixes=PATH,OLD-PATH
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -new-struct-path-tbaa -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefixes=PATH,NEW-PATH
@@ -51,11 +52,11 @@ typedef struct
 uint32_t g(uint32_t *s, StructA *A, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z1g
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
-// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2:![0-9]+]]
+// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
 // PATH-LABEL: define{{.*}} i32 @_Z1g
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]]
-// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_A_f32:!.*]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2:![0-9]+]]
+// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA12:![0-9]+]]
 *s = 1;
 A->f32 = 4;
 return *s;
@@ -63,11 +64,11 @@ uint32_t g(uint32_t *s, StructA *A, uint64_t count) {
 uint32_t g2(uint32_t *s, StructA *A, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z2g2
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i16:!.*]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TBAA12:![0-9]+]]
 // PATH-LABEL: define{{.*}} i32 @_Z2g2
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// PATH: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TAG_A_f16:!.*]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// PATH: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TBAA15:![0-9]+]]
 *s = 1;
 A->f16 = 4;
 return *s;
@@ -75,11 +76,11 @@ uint32_t g2(uint32_t *s, StructA *A, uint64_t count) {
 uint32_t g3(StructA *A, StructB *B, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z2g3
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
 // PATH-LABEL: define{{.*}} i32 @_Z2g3
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_A_f32]]
-// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_B_a_f32:!.*]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA12]]
+// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA16:![0-9]+]]
 A->f32 = 1;
 B->a.f32 = 4;
 return A->f32;
@@ -87,11 +88,11 @@ uint32_t g3(StructA *A, StructB *B, uint64_t count) {
 uint32_t g4(StructA *A, StructB *B, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z2g4
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i16]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TBAA12]]
 // PATH-LABEL: define{{.*}} i32 @_Z2g4
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_A_f32]]
-// PATH: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TAG_B_a_f16:!.*]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA12]]
+// PATH: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TBAA18:![0-9]+]]
 A->f32 = 1;
 B->a.f16 = 4;
 return A->f32;
@@ -99,11 +100,11 @@ uint32_t g4(StructA *A, StructB *B, uint64_t count) {
 uint32_t g5(StructA *A, StructB *B, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z2g5
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
 // PATH-LABEL: define{{.*}} i32 @_Z2g5
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_A_f32]]
-// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_B_f32:!.*]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA12]]
+// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA19:![0-9]+]]
 A->f32 = 1;
 B->f32 = 4;
 return A->f32;
@@ -111,11 +112,11 @@ uint32_t g5(StructA *A, StructB *B, uint64_t count) {
 uint32_t g6(StructA *A, StructB *B, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z2g6
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
 // PATH-LABEL: define{{.*}} i32 @_Z2g6
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_A_f32]]
-// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_B_a_f32_2:!.*]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA12]]
+// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA20:![0-9]+]]
 A->f32 = 1;
 B->a.f32_2 = 4;
 return A->f32;
@@ -123,11 +124,11 @@ uint32_t g6(StructA *A, StructB *B, uint64_t count) {
 uint32_t g7(StructA *A, StructS *S, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z2g7
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
 // PATH-LABEL: define{{.*}} i32 @_Z2g7
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_A_f32]]
-// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_S_f32:!.*]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA12]]
+// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA21:![0-9]+]]
 A->f32 = 1;
 S->f32 = 4;
 return A->f32;
@@ -135,11 +136,11 @@ uint32_t g7(StructA *A, StructS *S, uint64_t count) {
 uint32_t g8(StructA *A, StructS *S, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z2g8
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i16]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TBAA12]]
 // PATH-LABEL: define{{.*}} i32 @_Z2g8
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_A_f32]]
-// PATH: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TAG_S_f16:!.*]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA12]]
+// PATH: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TBAA23:![0-9]+]]
 A->f32 = 1;
 S->f16 = 4;
 return A->f32;
@@ -147,11 +148,11 @@ uint32_t g8(StructA *A, StructS *S, uint64_t count) {
 uint32_t g9(StructS *S, StructS2 *S2, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z2g9
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
 // PATH-LABEL: define{{.*}} i32 @_Z2g9
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_S_f32]]
-// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_S2_f32:!.*]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA21]]
+// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA24:![0-9]+]]
 S->f32 = 1;
 S2->f32 = 4;
 return S->f32;
@@ -159,11 +160,11 @@ uint32_t g9(StructS *S, StructS2 *S2, uint64_t count) {
 uint32_t g10(StructS *S, StructS2 *S2, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z3g10
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i16]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TBAA12]]
 // PATH-LABEL: define{{.*}} i32 @_Z3g10
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_S_f32]]
-// PATH: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TAG_S2_f16:!.*]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA21]]
+// PATH: store i16 4, ptr %{{.*}}, align 4, !tbaa [[TBAA26:![0-9]+]]
 S->f32 = 1;
 S2->f16 = 4;
 return S->f32;
@@ -171,11 +172,11 @@ uint32_t g10(StructS *S, StructS2 *S2, uint64_t count) {
 uint32_t g11(StructC *C, StructD *D, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z3g11
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
 // PATH-LABEL: define{{.*}} i32 @_Z3g11
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_C_b_a_f32:!.*]]
-// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_D_b_a_f32:!.*]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA27:![0-9]+]]
+// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA29:![0-9]+]]
 C->b.a.f32 = 1;
 D->b.a.f32 = 4;
 return C->b.a.f32;
@@ -183,12 +184,12 @@ uint32_t g11(StructC *C, StructD *D, uint64_t count) {
 uint32_t g12(StructC *C, StructD *D, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z3g12
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
 // TODO: differentiate the two accesses.
 // PATH-LABEL: define{{.*}} i32 @_Z3g12
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_B_a_f32]]
-// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_B_a_f32]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA16]]
+// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA16]]
 StructB *b1 = &(C->b);
 StructB *b2 = &(D->b);
 // b1, b2 have different context.
@@ -208,9 +209,9 @@ struct five {
 char g13(struct five *a, struct five *b) {
 return a->b;
 // CHECK-LABEL: define{{.*}} signext i8 @_Z3g13
-// CHECK: load i8, ptr %{{.*}}, align 1, !tbaa [[TAG_char:!.*]]
+// CHECK: load i8, ptr %{{.*}}, align 1, !tbaa [[TBAA16:![0-9]+]]
 // PATH-LABEL: define{{.*}} signext i8 @_Z3g13
-// PATH: load i8, ptr %{{.*}}, align 1, !tbaa [[TAG_five_b:!.*]]
+// PATH: load i8, ptr %{{.*}}, align 1, !tbaa [[TBAA33:![0-9]+]]
 }
 
 struct six {
@@ -221,9 +222,9 @@ struct six {
 };
 char g14(struct six *a, struct six *b) {
 // CHECK-LABEL: define{{.*}} signext i8 @_Z3g14
-// CHECK: load i8, ptr %{{.*}}, align 1, !tbaa [[TAG_char]]
+// CHECK: load i8, ptr %{{.*}}, align 1, !tbaa [[TBAA16]]
 // PATH-LABEL: define{{.*}} signext i8 @_Z3g14
-// PATH: load i8, ptr %{{.*}}, align 1, !tbaa [[TAG_six_b:!.*]]
+// PATH: load i8, ptr %{{.*}}, align 1, !tbaa [[TBAA37:![0-9]+]]
 return a->b;
 }
@@ -231,75 +232,78 @@ char g14(struct six *a, struct six *b) {
 typedef StructS StructS3;
 uint32_t g15(StructS *S, StructS3 *S3, uint64_t count) {
 // CHECK-LABEL: define{{.*}} i32 @_Z3g15
-// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
-// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]]
+// CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
+// CHECK: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA2]]
 // PATH-LABEL: define{{.*}} i32 @_Z3g15
-// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TAG_S_f32]]
-// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TAG_S_f32]]
+// PATH: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA21]]
+// PATH: store i32 4, ptr %{{.*}}, align 4, !tbaa [[TBAA21]]
 S->f32 = 1;
 S3->f32 = 4;
 return S->f32;
 }
 
-// CHECK: [[TYPE_char:!.*]] = !{!"omnipotent char", [[TAG_cxx_tbaa:!.*]],
-// CHECK: [[TAG_cxx_tbaa]] = !{!"Simple C++ TBAA"}
-// CHECK: [[TAG_i32]] = !{[[TYPE_i32:!.*]], [[TYPE_i32]], i64 0}
-// CHECK: [[TYPE_i32]] = !{!"int", [[TYPE_char]],
-// CHECK: [[TAG_i16]] = !{[[TYPE_i16:!.*]], [[TYPE_i16]], i64 0}
-// CHECK: [[TYPE_i16]] = !{!"short", [[TYPE_char]],
-// CHECK: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0}
-
-// OLD-PATH: [[TYPE_CHAR:!.*]] = !{!"omnipotent char", !
-// OLD-PATH: [[TAG_i32]] = !{[[TYPE_INT:!.*]], [[TYPE_INT]], i64 0}
-// OLD-PATH: [[TYPE_INT]] = !{!"int", [[TYPE_CHAR]]
-// OLD-PATH: [[TAG_A_f32]] = !{[[TYPE_A:!.*]], [[TYPE_INT]], i64 4}
-// OLD-PATH: [[TYPE_A]] = !{!"_ZTS7StructA", [[TYPE_SHORT:!.*]], i64 0, [[TYPE_INT]], i64 4, [[TYPE_SHORT]], i64 8, [[TYPE_INT]], i64 12}
-// OLD-PATH: [[TYPE_SHORT:!.*]] = !{!"short", [[TYPE_CHAR]]
-// OLD-PATH: [[TAG_A_f16]] = !{[[TYPE_A]], [[TYPE_SHORT]], i64 0}
-// OLD-PATH: [[TAG_B_a_f32]] = !{[[TYPE_B:!.*]], [[TYPE_INT]], i64 8}
-// OLD-PATH: [[TYPE_B]] = !{!"_ZTS7StructB", [[TYPE_SHORT]], i64 0, [[TYPE_A]], i64 4, [[TYPE_INT]], i64 20}
-// OLD-PATH: [[TAG_B_a_f16]] = !{[[TYPE_B]], [[TYPE_SHORT]], i64 4}
-// OLD-PATH: [[TAG_B_f32]] = !{[[TYPE_B]], [[TYPE_INT]], i64 20}
-// OLD-PATH: [[TAG_B_a_f32_2]] = !{[[TYPE_B]], [[TYPE_INT]], i64 16}
-// OLD-PATH: [[TAG_S_f32]] = !{[[TYPE_S:!.*]], [[TYPE_INT]], i64 4}
-// OLD-PATH: [[TYPE_S]] = !{!"_ZTS7StructS", [[TYPE_SHORT]], i64 0, [[TYPE_INT]], i64 4}
-// OLD-PATH: [[TAG_S_f16]] = !{[[TYPE_S]], [[TYPE_SHORT]], i64 0}
-// OLD-PATH: [[TAG_S2_f32]] = !{[[TYPE_S2:!.*]], [[TYPE_INT]], i64 4}
-// OLD-PATH: [[TYPE_S2]] = !{!"_ZTS8StructS2", [[TYPE_SHORT]], i64 0, [[TYPE_INT]], i64 4}
-// OLD-PATH: [[TAG_S2_f16]] = !{[[TYPE_S2]], [[TYPE_SHORT]], i64 0}
-// OLD-PATH: [[TAG_C_b_a_f32]] = !{[[TYPE_C:!.*]], [[TYPE_INT]], i64 12}
-// OLD-PATH: [[TYPE_C]] = !{!"_ZTS7StructC", [[TYPE_SHORT]], i64 0, [[TYPE_B]], i64 4, [[TYPE_INT]], i64 28}
-// OLD-PATH: [[TAG_D_b_a_f32]] = !{[[TYPE_D:!.*]], [[TYPE_INT]], i64 12}
-// OLD-PATH: [[TYPE_D]] = !{!"_ZTS7StructD", [[TYPE_SHORT]], i64 0, [[TYPE_B]], i64 4, [[TYPE_INT]], i64 28, [[TYPE_CHAR]], i64 32}
-// OLD-PATH: [[TAG_five_b]] = !{[[TYPE_five:!.*]], [[TYPE_CHAR]], i64 1}
-// OLD-PATH: [[TYPE_five]] = !{!"_ZTS4five", [[TYPE_CHAR]], i64 0, [[TYPE_CHAR]], i64 1, [[TYPE_CHAR]], i64 2}
-// OLD-PATH: [[TAG_six_b]] = !{[[TYPE_six:!.*]], [[TYPE_CHAR]], i64 4}
-// OLD-PATH: [[TYPE_six]] = !{!"_ZTS3six", [[TYPE_CHAR]], i64 0, [[TYPE_CHAR]], i64 4, [[TYPE_CHAR]], i64 5}
-
-// NEW-PATH-DAG: [[ROOT:!.*]] = !{!"Simple C++ TBAA"}
-// NEW-PATH-DAG: [[TYPE_char:!.*]] = !{[[ROOT]], i64 1, !"omnipotent char"}
-// NEW-PATH-DAG: [[TYPE_short:!.*]] = !{[[TYPE_char]], i64 2, !"short"}
-// NEW-PATH-DAG: [[TYPE_int:!.*]] = !{[[TYPE_char]], i64 4, !"int"}
-// NEW-PATH-DAG: [[TAG_i32:!.*]] = !{[[TYPE_int]], [[TYPE_int]], i64 0, i64 4}
-// NEW-PATH-DAG: [[TYPE_A:!.*]] = !{[[TYPE_char]], i64 16, !"_ZTS7StructA", [[TYPE_short]], i64 0, i64 2, [[TYPE_int]], i64 4, i64 4, [[TYPE_short]], i64 8, i64 2, [[TYPE_int]], i64 12, i64 4}
-// NEW-PATH-DAG: [[TAG_A_f16]] = !{[[TYPE_A]], [[TYPE_short]], i64 0, i64 2}
-// NEW-PATH-DAG: [[TAG_A_f32]] = !{[[TYPE_A]], [[TYPE_int]], i64 4, i64 4}
-// NEW-PATH-DAG: [[TYPE_B:!.*]] = !{[[TYPE_char]], i64 24, !"_ZTS7StructB", [[TYPE_short]], i64 0, i64 2, [[TYPE_A]], i64 4, i64 16, [[TYPE_int]], i64 20, i64 4}
-// NEW-PATH-DAG: [[TAG_B_a_f16]] = !{[[TYPE_B]], [[TYPE_short]], i64 4, i64 2}
-// NEW-PATH-DAG: [[TAG_B_a_f32]] = !{[[TYPE_B]], [[TYPE_int]], i64 8, i64 4}
-// NEW-PATH-DAG: [[TAG_B_f32]] = !{[[TYPE_B]], [[TYPE_int]], i64 20, i64 4}
-// NEW-PATH-DAG: [[TAG_B_a_f32_2]] = !{[[TYPE_B]], [[TYPE_int]], i64 16, i64 4}
-// NEW-PATH-DAG: [[TYPE_S:!.*]] = !{[[TYPE_char]], i64 8, !"_ZTS7StructS", [[TYPE_short]], i64 0, i64 2, [[TYPE_int]], i64 4, i64 4}
-// NEW-PATH-DAG: [[TAG_S_f16]] = !{[[TYPE_S]], [[TYPE_short]], i64 0, i64 2}
-// NEW-PATH-DAG: [[TAG_S_f32]] = !{[[TYPE_S]], [[TYPE_int]], i64 4, i64 4}
-// NEW-PATH-DAG: [[TYPE_S2:!.*]] = !{[[TYPE_char]], i64 8, !"_ZTS8StructS2", [[TYPE_short]], i64 0, i64 2, [[TYPE_int]], i64 4, i64 4}
-// NEW-PATH-DAG: [[TAG_S2_f16]] = !{[[TYPE_S2]], [[TYPE_short]], i64 0, i64 2}
-// NEW-PATH-DAG: [[TAG_S2_f32]] = !{[[TYPE_S2]], [[TYPE_int]], i64 4, i64 4}
-// NEW-PATH-DAG: [[TYPE_C:!.*]] = !{[[TYPE_char]], i64 32, !"_ZTS7StructC", [[TYPE_short]], i64 0, i64 2, [[TYPE_B]], i64 4, i64 24, [[TYPE_int]], i64 28, i64 4}
-// NEW-PATH-DAG: [[TAG_C_b_a_f32]] = !{[[TYPE_C]], [[TYPE_int]], i64 12, i64 4}
-// NEW-PATH-DAG: [[TYPE_D:!.*]] = !{[[TYPE_char]], i64 36, !"_ZTS7StructD", [[TYPE_short]], i64 0, i64 2, [[TYPE_B]], i64 4, i64 24, [[TYPE_int]], i64 28, i64 4, [[TYPE_char]], i64 32, i64 1}
-// NEW-PATH-DAG: [[TAG_D_b_a_f32]] = !{[[TYPE_D]], [[TYPE_int]], i64 12, i64 4}
-// NEW-PATH-DAG: [[TYPE_five:!.*]] = !{[[TYPE_char]], i64 3, !"_ZTS4five", [[TYPE_char]], i64 0, i64 1, [[TYPE_char]], i64 1, i64 1, [[TYPE_char]], i64 2, i64 1}
-// NEW-PATH-DAG: [[TAG_five_b]] = !{[[TYPE_five]], [[TYPE_char]], i64 1, i64 1}
-// NEW-PATH-DAG: [[TYPE_six:!.*]] = !{[[TYPE_char]], i64 6, !"_ZTS3six", [[TYPE_char]], i64 0, i64 1, [[TYPE_char]], i64 4, i64 1, [[TYPE_char]], i64 5, i64 1}
-// NEW-PATH-DAG: [[TAG_six_b]] = !{[[TYPE_six]], [[TYPE_char]], i64 4, i64 1}
+//.
+// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK: [[META5]] = !{!"Simple C++ TBAA"}
+// CHECK: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
+// CHECK: [[META13]] = !{!"short", [[META4]], i64 0}
+// CHECK: [[TBAA16]] = !{[[META4]], [[META4]], i64 0}
+//.
+// OLD-PATH: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// OLD-PATH: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
+// OLD-PATH: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// OLD-PATH: [[META5]] = !{!"Simple C++ TBAA"}
+// OLD-PATH: [[TBAA12]] = !{[[META13:![0-9]+]], [[META3]], i64 4}
+// OLD-PATH: [[META13]] = !{!"_ZTS7StructA", [[META14:![0-9]+]], i64 0, [[META3]], i64 4, [[META14]], i64 8, [[META3]], i64 12}
+// OLD-PATH: [[META14]] = !{!"short", [[META4]], i64 0}
+// OLD-PATH: [[TBAA15]] = !{[[META13]], [[META14]], i64 0}
+// OLD-PATH: [[TBAA16]] = !{[[META17:![0-9]+]], [[META3]], i64 8}
+// OLD-PATH: [[META17]] = !{!"_ZTS7StructB", [[META14]], i64 0, [[META13]], i64 4, [[META3]], i64 20}
+// OLD-PATH: [[TBAA18]] = !{[[META17]], [[META14]], i64 4}
+// OLD-PATH: [[TBAA19]] = !{[[META17]], [[META3]], i64 20}
+// OLD-PATH: [[TBAA20]] = !{[[META17]], [[META3]], i64 16}
+// OLD-PATH: [[TBAA21]] = !{[[META22:![0-9]+]], [[META3]], i64 4}
+// OLD-PATH: [[META22]] = !{!"_ZTS7StructS", [[META14]], i64 0, [[META3]], i64 4}
+// OLD-PATH: [[TBAA23]] = !{[[META22]], [[META14]], i64 0}
+// OLD-PATH: [[TBAA24]] = !{[[META25:![0-9]+]], [[META3]], i64 4}
+// OLD-PATH: [[META25]] = !{!"_ZTS8StructS2", [[META14]], i64 0, [[META3]], i64 4}
+// OLD-PATH: [[TBAA26]] = !{[[META25]], [[META14]], i64 0}
+// OLD-PATH: [[TBAA27]] = !{[[META28:![0-9]+]], [[META3]], i64 12}
+// OLD-PATH: [[META28]] = !{!"_ZTS7StructC", [[META14]], i64 0, [[META17]], i64 4, [[META3]], i64 28}
+// OLD-PATH: [[TBAA29]] = !{[[META30:![0-9]+]], [[META3]], i64 12}
+// OLD-PATH: [[META30]] = !{!"_ZTS7StructD", [[META14]], i64 0, [[META17]], i64 4, [[META3]], i64 28, [[META4]], i64 32}
+// OLD-PATH: [[TBAA33]] = !{[[META34:![0-9]+]], [[META4]], i64 1}
+// OLD-PATH: [[META34]] = !{!"_ZTS4five", [[META4]], i64 0, [[META4]], i64 1, [[META4]], i64 2}
+// OLD-PATH: [[TBAA37]] = !{[[META38:![0-9]+]], [[META4]], i64 4}
+// OLD-PATH: [[META38]] = !{!"_ZTS3six", [[META4]], i64 0, [[META4]], i64 4, [[META4]], i64 5}
+//.
+// NEW-PATH: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0, i64 4}
+// NEW-PATH: [[META3]] = !{[[META4:![0-9]+]], i64 4, !"int"}
+// NEW-PATH: [[META4]] = !{[[META5:![0-9]+]], i64 1, !"omnipotent char"}
+// NEW-PATH: [[META5]] = !{!"Simple C++ TBAA"}
+// NEW-PATH: [[TBAA12]] = !{[[META13:![0-9]+]], [[META3]], i64 4, i64 4}
+// NEW-PATH: [[META13]] = !{[[META4]], i64 16, !"_ZTS7StructA", [[META14:![0-9]+]], i64 0, i64 2, [[META3]], i64 4, i64 4, [[META14]], i64 8, i64 2, [[META3]], i64 12, i64 4}
+// NEW-PATH: [[META14]] = !{[[META4]], i64 2, !"short"}
+// NEW-PATH: [[TBAA15]] = !{[[META13]], [[META14]], i64 0, i64 2}
+// NEW-PATH: [[TBAA16]] = !{[[META17:![0-9]+]], [[META3]], i64 8, i64 4}
+// NEW-PATH: [[META17]] = !{[[META4]], i64 24, !"_ZTS7StructB", [[META14]], i64 0, i64 2, [[META13]], i64 4, i64 16, [[META3]], i64 20, i64 4}
+// NEW-PATH: [[TBAA18]] = !{[[META17]], [[META14]], i64 4, i64 2}
+// NEW-PATH: [[TBAA19]] = !{[[META17]], [[META3]], i64 20, i64 4}
+// NEW-PATH: [[TBAA20]] = !{[[META17]], [[META3]], i64 16, i64 4}
+// NEW-PATH: [[TBAA21]] = !{[[META22:![0-9]+]], [[META3]], i64 4, i64 4}
+// NEW-PATH: [[META22]] = !{[[META4]], i64 8, !"_ZTS7StructS", [[META14]], i64 0, i64 2, [[META3]], i64 4, i64 4}
+// NEW-PATH: [[TBAA23]] = !{[[META22]], [[META14]], i64 0, i64 2}
+// NEW-PATH: [[TBAA24]] = !{[[META25:![0-9]+]], [[META3]], i64 4, i64 4}
+// NEW-PATH: [[META25]] = !{[[META4]], i64 8, !"_ZTS8StructS2", [[META14]], i64 0, i64 2, [[META3]], i64 4, i64 4}
+// NEW-PATH: [[TBAA26]] = !{[[META25]], [[META14]], i64 0, i64 2}
+// NEW-PATH: [[TBAA27]] = !{[[META28:![0-9]+]], [[META3]], i64 12, i64 4}
+// NEW-PATH: [[META28]] = !{[[META4]], i64 32, !"_ZTS7StructC", [[META14]], i64 0, i64 2, [[META17]], i64 4, i64 24, [[META3]], i64 28, i64 4}
+// NEW-PATH: [[TBAA29]] = !{[[META30:![0-9]+]], [[META3]], i64 12, i64 4}
+// NEW-PATH: [[META30]] = !{[[META4]], i64 36, !"_ZTS7StructD", [[META14]], i64 0, i64 2, [[META17]], i64 4, i64 24, [[META3]], i64 28, i64 4, [[META4]], i64 32, i64 1}
+// NEW-PATH: [[TBAA33]] = !{[[META34:![0-9]+]], [[META4]], i64 1, i64 1}
+// NEW-PATH: [[META34]] = !{[[META4]], i64 3, !"_ZTS4five", [[META4]], i64 0, i64 1, [[META4]], i64 1, i64 1, [[META4]], i64 2, i64 1}
+// NEW-PATH: [[TBAA37]] = !{[[META38:![0-9]+]], [[META4]], i64 4, i64 1}
+// NEW-PATH: [[META38]] = !{[[META4]], i64 6, !"_ZTS3six", [[META4]], i64 0, i64 1, [[META4]], i64 4, i64 1, [[META4]], i64 5, i64 1}
+//.
diff --git a/clang/test/CodeGen/unified-lto-module-flag.ll b/clang/test/CodeGen/unified-lto-module-flag.ll
new file mode 100644
index 0000000000000..deefe826d1566
--- /dev/null
+++ b/clang/test/CodeGen/unified-lto-module-flag.ll
@@ -0,0 +1,11 @@
+; Test that we do not duplicate the UnifiedLTO module flag.
+;
+; RUN: %clang_cc1 -emit-llvm -flto=full -funified-lto -o - %s | FileCheck %s
+
+; CHECK: !llvm.module.flags = !{!0, !1, !2, !3}
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = !{i32 1, !"wchar_size", i32 2}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{i32 1, !"EnableSplitLTOUnit", i32 1}
+!3 = !{i32 1, !"UnifiedLTO", i32 1}
__attribute__((launch_bounds(__VA_ARGS__))) #define __grid_constant__ __attribute__((grid_constant)) +#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__))) +#define __no_cluster__ __attribute__((no_cluster)) #else #define __constant__ #define __device__ @@ -22,6 +24,8 @@ #define __managed__ #define __launch_bounds__(...) #define __grid_constant__ +#define __cluster_dims__(...) +#define __no_cluster__ #endif struct dim3 { diff --git a/clang/test/CodeGenCUDA/cluster_dims.cu b/clang/test/CodeGenCUDA/cluster_dims.cu new file mode 100644 index 0000000000000..00635e3572a7f --- /dev/null +++ b/clang/test/CodeGenCUDA/cluster_dims.cu @@ -0,0 +1,38 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -emit-llvm -x hip -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm -x hip -o - %s | FileCheck --check-prefix=HOST %s + +#include "Inputs/cuda.h" + +const int constint = 4; + +// HOST-NOT: "amdgpu-cluster-dims" + +// CHECK: "amdgpu-cluster-dims"="2,2,2" +__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} + +// CHECK: "amdgpu-cluster-dims"="2,2,1" +__global__ void __cluster_dims__(2, 2) test_literal_2d() {} + +// CHECK: "amdgpu-cluster-dims"="4,1,1" +__global__ void __cluster_dims__(4) test_literal_1d() {} + +// CHECK: "amdgpu-cluster-dims"="4,2,1" +__global__ void __cluster_dims__(constint, constint / 2, 1) test_constant() {} + +// CHECK: "amdgpu-cluster-dims"="0,0,0" +__global__ void __no_cluster__ test_no_cluster() {} + +// CHECK: "amdgpu-cluster-dims"="7,1,1" +template +__global__ void __cluster_dims__(a) test_template_1d() {} +template __global__ void test_template_1d<7>(); + +// CHECK: "amdgpu-cluster-dims"="2,6,1" +template +__global__ void __cluster_dims__(a, b) test_template_2d() {} +template __global__ void test_template_2d<2, 6>(); + +// CHECK: "amdgpu-cluster-dims"="1,2,3" +template +__global__ void __cluster_dims__(a, b, c) test_template_3d() {} +template __global__ void test_template_3d<1, 2, 3>(); diff --git a/clang/test/CodeGenCXX/attr-likelihood-if-branch-weights.cpp b/clang/test/CodeGenCXX/attr-likelihood-if-branch-weights.cpp index 8969e12f8f797..2af292961a331 100644 --- a/clang/test/CodeGenCXX/attr-likelihood-if-branch-weights.cpp +++ b/clang/test/CodeGenCXX/attr-likelihood-if-branch-weights.cpp @@ -10,7 +10,7 @@ extern bool B(); // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[RETVAL:%.*]] = alloca i1, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2:![0-9]+]], !range [[RNG6:![0-9]+]], !noundef [[META7:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6:![0-9]+]], !range [[RNG8:![0-9]+]], !noundef [[META9:![0-9]+]] // CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 // CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 true) // CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] @@ -38,7 +38,7 @@ bool f() { // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[RETVAL:%.*]] = alloca i1, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 // CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 
@llvm.expect.i1(i1 [[LOADEDV]], i1 false) // CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] @@ -67,7 +67,7 @@ bool g() { // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[RETVAL:%.*]] = alloca i1, align 1 -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 // CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) // CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] @@ -93,14 +93,14 @@ bool h() { // CHECK-LABEL: define dso_local void @_Z8NullStmtv( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 // CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) // CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] // CHECK: [[IF_THEN]]: // CHECK-NEXT: br label %[[IF_END:.*]] // CHECK: [[IF_ELSE]]: -// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA6]] // CHECK-NEXT: br label %[[IF_END]] // CHECK: [[IF_END]]: // CHECK-NEXT: ret void @@ -117,7 +117,7 @@ void NullStmt() { // CHECK-LABEL: define dso_local void @_Z6IfStmtv( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 // CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) // CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END2:.*]] @@ -129,7 +129,7 @@ void NullStmt() { // CHECK: [[IF_END]]: // CHECK-NEXT: br label %[[IF_END2]] // CHECK: [[IF_END2]]: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV3:%.*]] = trunc i8 [[TMP1]] to i1 // CHECK-NEXT: br i1 [[LOADEDV3]], label %[[IF_THEN4:.*]], label %[[IF_END8:.*]] // CHECK: [[IF_THEN4]]: @@ -137,7 +137,7 @@ void NullStmt() { // CHECK-NEXT: [[CALL5_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CALL5]], i1 false) // CHECK-NEXT: br i1 [[CALL5_EXPVAL]], label %[[IF_THEN6:.*]], label %[[IF_END7:.*]] // CHECK: [[IF_THEN6]]: -// CHECK-NEXT: store volatile i8 0, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: store volatile i8 0, ptr @b, align 1, !tbaa [[BOOL_TBAA6]] // CHECK-NEXT: br label %[[IF_END7]] // CHECK: [[IF_END7]]: // CHECK-NEXT: br label %[[IF_END8]] @@ -157,7 +157,7 @@ void IfStmt() { // CHECK-LABEL: define dso_local void @_Z9WhileStmtv( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa 
[[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 // CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) // CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] @@ -167,11 +167,11 @@ void IfStmt() { // CHECK-NEXT: [[CALL:%.*]] = call noundef zeroext i1 @_Z1Bv() // CHECK-NEXT: br i1 [[CALL]], label %[[WHILE_BODY:.*]], label %[[WHILE_END:.*]] // CHECK: [[WHILE_BODY]]: -// CHECK-NEXT: br label %[[WHILE_COND]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK-NEXT: br label %[[WHILE_COND]], !llvm.loop [[LOOP10:![0-9]+]] // CHECK: [[WHILE_END]]: // CHECK-NEXT: br label %[[IF_END]] // CHECK: [[IF_END]]: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV1:%.*]] = trunc i8 [[TMP1]] to i1 // CHECK-NEXT: br i1 [[LOADEDV1]], label %[[IF_THEN2:.*]], label %[[IF_END7:.*]] // CHECK: [[IF_THEN2]]: @@ -181,8 +181,8 @@ void IfStmt() { // CHECK-NEXT: [[CALL4_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CALL4]], i1 false) // CHECK-NEXT: br i1 [[CALL4_EXPVAL]], label %[[WHILE_BODY5:.*]], label %[[WHILE_END6:.*]] // CHECK: [[WHILE_BODY5]]: -// CHECK-NEXT: store volatile i8 0, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] -// CHECK-NEXT: br label %[[WHILE_COND3]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK-NEXT: store volatile i8 0, ptr @b, align 1, !tbaa [[BOOL_TBAA6]] +// CHECK-NEXT: br label %[[WHILE_COND3]], !llvm.loop [[LOOP13:![0-9]+]] // CHECK: [[WHILE_END6]]: // CHECK-NEXT: br label %[[IF_END7]] // CHECK: [[IF_END7]]: @@ -200,7 +200,7 @@ void WhileStmt() { // CHECK-LABEL: define dso_local void @_Z6DoStmtv( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 // CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) // CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] @@ -210,11 +210,11 @@ void WhileStmt() { // CHECK-NEXT: br label %[[DO_COND:.*]] // CHECK: [[DO_COND]]: // CHECK-NEXT: [[CALL:%.*]] = call noundef zeroext i1 @_Z1Bv() -// CHECK-NEXT: br i1 [[CALL]], label %[[DO_BODY]], label %[[DO_END:.*]], !llvm.loop [[LOOP12:![0-9]+]] +// CHECK-NEXT: br i1 [[CALL]], label %[[DO_BODY]], label %[[DO_END:.*]], !llvm.loop [[LOOP14:![0-9]+]] // CHECK: [[DO_END]]: // CHECK-NEXT: br label %[[IF_END]] // CHECK: [[IF_END]]: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV1:%.*]] = trunc i8 [[TMP1]] to i1 // CHECK-NEXT: br i1 [[LOADEDV1]], label %[[IF_THEN2:.*]], label %[[IF_END7:.*]] // CHECK: [[IF_THEN2]]: @@ -223,7 +223,7 @@ void WhileStmt() { // CHECK-NEXT: br label %[[DO_COND4:.*]] // CHECK: [[DO_COND4]]: // CHECK-NEXT: [[CALL5:%.*]] = call noundef zeroext i1 @_Z1Bv() -// CHECK-NEXT: 
br i1 [[CALL5]], label %[[DO_BODY3]], label %[[DO_END6:.*]], !llvm.loop [[LOOP13:![0-9]+]] +// CHECK-NEXT: br i1 [[CALL5]], label %[[DO_BODY3]], label %[[DO_END6:.*]], !llvm.loop [[LOOP15:![0-9]+]] // CHECK: [[DO_END6]]: // CHECK-NEXT: br label %[[IF_END7]] // CHECK: [[IF_END7]]: @@ -244,7 +244,7 @@ void DoStmt() { // CHECK-LABEL: define dso_local void @_Z7ForStmtv( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 // CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) // CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] @@ -254,11 +254,11 @@ void DoStmt() { // CHECK-NEXT: [[CALL:%.*]] = call noundef zeroext i1 @_Z1Bv() // CHECK-NEXT: br i1 [[CALL]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] // CHECK: [[FOR_BODY]]: -// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]] +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] // CHECK: [[FOR_END]]: // CHECK-NEXT: br label %[[IF_END]] // CHECK: [[IF_END]]: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV1:%.*]] = trunc i8 [[TMP1]] to i1 // CHECK-NEXT: br i1 [[LOADEDV1]], label %[[IF_THEN2:.*]], label %[[IF_END7:.*]] // CHECK: [[IF_THEN2]]: @@ -268,7 +268,7 @@ void DoStmt() { // CHECK-NEXT: [[CALL4_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[CALL4]], i1 false) // CHECK-NEXT: br i1 [[CALL4_EXPVAL]], label %[[FOR_BODY5:.*]], label %[[FOR_END6:.*]] // CHECK: [[FOR_BODY5]]: -// CHECK-NEXT: br label %[[FOR_COND3]], !llvm.loop [[LOOP15:![0-9]+]] +// CHECK-NEXT: br label %[[FOR_COND3]], !llvm.loop [[LOOP17:![0-9]+]] // CHECK: [[FOR_END6]]: // CHECK-NEXT: br label %[[IF_END7]] // CHECK: [[IF_END7]]: @@ -286,14 +286,14 @@ void ForStmt() { // CHECK-LABEL: define dso_local void @_Z8GotoStmtv( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 // CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) // CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] // CHECK: [[IF_THEN]]: // CHECK-NEXT: br label %[[END:.*]] // CHECK: [[IF_ELSE]]: -// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA6]] // CHECK-NEXT: br label %[[IF_END:.*]] // CHECK: [[IF_END]]: // CHECK-NEXT: br label %[[END]] @@ -313,14 +313,14 @@ end:; // CHECK-LABEL: define dso_local void @_Z10ReturnStmtv( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // 
CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 // CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) // CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] // CHECK: [[IF_THEN]]: // CHECK-NEXT: br label %[[IF_END:.*]] // CHECK: [[IF_ELSE]]: -// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA6]] // CHECK-NEXT: br label %[[IF_END]] // CHECK: [[IF_END]]: // CHECK-NEXT: ret void @@ -337,31 +337,31 @@ void ReturnStmt() { // CHECK-LABEL: define dso_local void @_Z10SwitchStmtv( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP0:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP0]] to i1 // CHECK-NEXT: [[LOADEDV_EXPVAL:%.*]] = call i1 @llvm.expect.i1(i1 [[LOADEDV]], i1 false) // CHECK-NEXT: br i1 [[LOADEDV_EXPVAL]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] // CHECK: [[IF_THEN]]: -// CHECK-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA16:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2:![0-9]+]] // CHECK-NEXT: switch i32 [[TMP1]], label %[[SW_EPILOG:.*]] [ // CHECK-NEXT: ] // CHECK: [[SW_EPILOG]]: // CHECK-NEXT: br label %[[IF_END:.*]] // CHECK: [[IF_ELSE]]: -// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA6]] // CHECK-NEXT: br label %[[IF_END]] // CHECK: [[IF_END]]: -// CHECK-NEXT: [[TMP2:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA2]], !range [[RNG6]], !noundef [[META7]] +// CHECK-NEXT: [[TMP2:%.*]] = load volatile i8, ptr @b, align 1, !tbaa [[BOOL_TBAA6]], !range [[RNG8]], !noundef [[META9]] // CHECK-NEXT: [[LOADEDV1:%.*]] = trunc i8 [[TMP2]] to i1 // CHECK-NEXT: br i1 [[LOADEDV1]], label %[[IF_THEN2:.*]], label %[[IF_ELSE4:.*]] // CHECK: [[IF_THEN2]]: -// CHECK-NEXT: [[TMP3:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA16]] +// CHECK-NEXT: [[TMP3:%.*]] = load volatile i32, ptr @i, align 4, !tbaa [[INT_TBAA2]] // CHECK-NEXT: switch i32 [[TMP3]], label %[[SW_EPILOG3:.*]] [ // CHECK-NEXT: ] // CHECK: [[SW_EPILOG3]]: // CHECK-NEXT: br label %[[IF_END5:.*]] // CHECK: [[IF_ELSE4]]: -// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA2]] +// CHECK-NEXT: store volatile i8 1, ptr @b, align 1, !tbaa [[BOOL_TBAA6]] // CHECK-NEXT: br label %[[IF_END5]] // CHECK: [[IF_END5]]: // CHECK-NEXT: ret void @@ -383,20 +383,20 @@ void SwitchStmt() { } //. 
-// CHECK: [[BOOL_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// CHECK: [[META3]] = !{!"bool", [[META4:![0-9]+]], i64 0}
+// CHECK: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
+// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
 // CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
 // CHECK: [[META5]] = !{!"Simple C++ TBAA"}
-// CHECK: [[RNG6]] = !{i8 0, i8 2}
-// CHECK: [[META7]] = !{}
-// CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]], [[META10:![0-9]+]]}
-// CHECK: [[META9]] = !{!"llvm.loop.mustprogress"}
-// CHECK: [[META10]] = !{!"llvm.loop.unroll.disable"}
-// CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META9]], [[META10]]}
-// CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META9]], [[META10]]}
-// CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META9]], [[META10]]}
-// CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META9]], [[META10]]}
-// CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META9]], [[META10]]}
-// CHECK: [[INT_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0}
-// CHECK: [[META17]] = !{!"int", [[META4]], i64 0}
+// CHECK: [[BOOL_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
+// CHECK: [[META7]] = !{!"bool", [[META4]], i64 0}
+// CHECK: [[RNG8]] = !{i8 0, i8 2}
+// CHECK: [[META9]] = !{}
+// CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META11:![0-9]+]], [[META12:![0-9]+]]}
+// CHECK: [[META11]] = !{!"llvm.loop.mustprogress"}
+// CHECK: [[META12]] = !{!"llvm.loop.unroll.disable"}
+// CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META11]], [[META12]]}
+// CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META11]], [[META12]]}
+// CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META11]], [[META12]]}
+// CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META11]], [[META12]]}
+// CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META11]], [[META12]]}
 //.
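The hunk above is purely mechanical: the check variables are renumbered to match the regenerated IR's metadata order (the int TBAA node now binds first), while the code under test is unchanged. For orientation, a source pattern consistent with these checks might look like the following; the function bodies live outside this hunk, so the exact code here is an assumption:

// Hypothetical reduction, not the test's verbatim source: statement-level
// likelihood attributes lower to @llvm.expect.i1 calls on the branch condition.
extern volatile bool b;
bool B();

void ForStmt() {
  if (b) [[unlikely]] {            // the load of @b feeds llvm.expect.i1(..., i1 false)
    for (; B();) {}                // unannotated: plain br plus !llvm.loop metadata
  }
  if (b) {                         // unannotated: no expect call on this branch
    for (; B();) [[unlikely]] {}   // expect is applied to the call result instead
  }
}

The next hunk adds new coverage for __c11_atomic_compare_exchange_strong on structs whose size has no matching integer width.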
diff --git a/clang/test/CodeGenCXX/builtin-atomic-compare_exchange.cpp b/clang/test/CodeGenCXX/builtin-atomic-compare_exchange.cpp
new file mode 100644
index 0000000000000..4f1fe9852ce12
--- /dev/null
+++ b/clang/test/CodeGenCXX/builtin-atomic-compare_exchange.cpp
@@ -0,0 +1,130 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -std=c++20 -triple=x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+
+template <unsigned Size>
+struct S {
+  char data[Size];
+};
+
+// CHECK-LABEL: define dso_local noundef zeroext i1 @_Z21test_compare_exchangePU7_Atomic1SILj3EEPS0_S0_(
+// CHECK-SAME: ptr noundef [[A:%.*]], ptr noundef [[EXPECTED:%.*]], i24 [[DESIRED_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DESIRED:%.*]] = alloca [[STRUCT_S:%.*]], align 1
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[EXPECTED_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca [[STRUCT_S]], align 1
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca { [[STRUCT_S]], [1 x i8] }, align 4
+// CHECK-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca { [[STRUCT_S]], [1 x i8] }, align 4
+// CHECK-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[OLD_TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[DESIRED]], i32 0, i32 0
+// CHECK-NEXT: store i24 [[DESIRED_COERCE]], ptr [[COERCE_DIVE]], align 1
+// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[EXPECTED]], ptr [[EXPECTED_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[EXPECTED_ADDR]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DOTATOMICTMP]], ptr align 1 [[DESIRED]], i64 3, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP]], i8 0, i64 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ATOMIC_TEMP]], ptr align 1 [[TMP1]], i64 3, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], i8 0, i64 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ATOMIC_TEMP1]], ptr align 1 [[DOTATOMICTMP]], i64 3, i1 false)
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP1]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] monotonic monotonic, align 4
+// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// CHECK-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// CHECK: [[CMPXCHG_STORE_EXPECTED]]:
+// CHECK-NEXT: store i32 [[TMP5]], ptr [[OLD_TMP]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP1]], ptr align 4 [[OLD_TMP]], i64 3, i1 false)
+// CHECK-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// CHECK: [[CMPXCHG_CONTINUE]]:
+// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// CHECK-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// CHECK-NEXT: ret i1 [[LOADEDV]]
+//
+bool test_compare_exchange(_Atomic(S<3>)* a, S<3>* expected, S<3> desired) {
+  return __c11_atomic_compare_exchange_strong(a, expected, desired, 0, 0);
+}
+
+
+//
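The S<3> case above pins down the padding path: a 3-byte payload has no integer of matching width, so the builtin stages both the expected and desired values in zero-filled 4-byte temporaries, performs the cmpxchg on i32, and on failure copies only the three live bytes of the observed value back. A usage sketch under those semantics (Padded, obj, and try_swap are illustrative names, not part of the test):

// Sketch of what the S<3> checks guarantee; illustrative only.
template <unsigned Size>
struct Padded { char data[Size]; };

_Atomic(Padded<3>) obj;  // 3-byte payload, 4-byte atomic storage

bool try_swap(Padded<3> *expected, Padded<3> desired) {
  // Lowers to: memset both 4-byte temporaries to zero, memcpy the 3 live
  // bytes into each, cmpxchg i32, and on failure memcpy 3 bytes of the old
  // value back into *expected (the padding byte never escapes).
  return __c11_atomic_compare_exchange_strong(&obj, expected, desired, 0, 0);
}

The S<4> and S<6> instantiations that follow cover the exact-width case (a plain i32 cmpxchg with no staging temporaries) and the widen-to-i64 case with two padding bytes.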
CHECK-LABEL: define dso_local noundef zeroext i1 @_Z21test_compare_exchangePU7_Atomic1SILj4EEPS0_S0_( +// CHECK-SAME: ptr noundef [[A:%.*]], ptr noundef [[EXPECTED:%.*]], i32 [[DESIRED_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DESIRED:%.*]] = alloca [[STRUCT_S_0:%.*]], align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[EXPECTED_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca [[STRUCT_S_0]], align 1 +// CHECK-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[DESIRED]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[DESIRED_COERCE]], ptr [[COERCE_DIVE]], align 1 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store ptr [[EXPECTED]], ptr [[EXPECTED_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[EXPECTED_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DOTATOMICTMP]], ptr align 1 [[DESIRED]], i64 4, i1 false) +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 1 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +// CHECK-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// CHECK: [[CMPXCHG_STORE_EXPECTED]]: +// CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP1]], align 1 +// CHECK-NEXT: br label %[[CMPXCHG_CONTINUE]] +// CHECK: [[CMPXCHG_CONTINUE]]: +// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 +// CHECK-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 +// CHECK-NEXT: ret i1 [[LOADEDV]] +// +bool test_compare_exchange(_Atomic(S<4>)* a, S<4>* expected, S<4> desired) { + return __c11_atomic_compare_exchange_strong(a, expected, desired, 0, 0); +} + +// CHECK-LABEL: define dso_local noundef zeroext i1 @_Z21test_compare_exchangePU7_Atomic1SILj6EEPS0_S0_( +// CHECK-SAME: ptr noundef [[A:%.*]], ptr noundef [[EXPECTED:%.*]], i48 [[DESIRED_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DESIRED:%.*]] = alloca [[STRUCT_S_1:%.*]], align 1 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[EXPECTED_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTATOMICTMP:%.*]] = alloca [[STRUCT_S_1]], align 1 +// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca { [[STRUCT_S_1]], [2 x i8] }, align 8 +// CHECK-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca { [[STRUCT_S_1]], [2 x i8] }, align 8 +// CHECK-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[OLD_TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_S_1]], ptr [[DESIRED]], i32 0, i32 0 +// CHECK-NEXT: store i48 [[DESIRED_COERCE]], ptr [[COERCE_DIVE]], align 1 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store ptr [[EXPECTED]], ptr [[EXPECTED_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[EXPECTED_ADDR]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DOTATOMICTMP]], ptr align 1 
[[DESIRED]], i64 6, i1 false) +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[ATOMIC_TEMP]], i8 0, i64 8, i1 false) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ATOMIC_TEMP]], ptr align 1 [[TMP1]], i64 6, i1 false) +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[ATOMIC_TEMP1]], i8 0, i64 8, i1 false) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[ATOMIC_TEMP1]], ptr align 1 [[DOTATOMICTMP]], i64 6, i1 false) +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[ATOMIC_TEMP]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[ATOMIC_TEMP1]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i64 [[TMP2]], i64 [[TMP3]] monotonic monotonic, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +// CHECK-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// CHECK: [[CMPXCHG_STORE_EXPECTED]]: +// CHECK-NEXT: store i64 [[TMP5]], ptr [[OLD_TMP]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP1]], ptr align 8 [[OLD_TMP]], i64 6, i1 false) +// CHECK-NEXT: br label %[[CMPXCHG_CONTINUE]] +// CHECK: [[CMPXCHG_CONTINUE]]: +// CHECK-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 +// CHECK-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// CHECK-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 +// CHECK-NEXT: ret i1 [[LOADEDV]] +// +bool test_compare_exchange(_Atomic(S<6>)* a, S<6>* expected, S<6> desired) { + return __c11_atomic_compare_exchange_strong(a, expected, desired, 0, 0); +} diff --git a/clang/test/CodeGenCXX/builtin-get-vtable-pointer.cpp b/clang/test/CodeGenCXX/builtin-get-vtable-pointer.cpp index 604fb6c5585ac..0bde63496f4cb 100644 --- a/clang/test/CodeGenCXX/builtin-get-vtable-pointer.cpp +++ b/clang/test/CodeGenCXX/builtin-get-vtable-pointer.cpp @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 %s -x c++ -std=c++23 -triple x86_64-apple-darwin10 -emit-llvm -O1 -disable-llvm-passes -no-enable-noundef-analysis -o - | FileCheck --check-prefix=CHECK-NOAUTH %s // RUN: %clang_cc1 %s -x c++ -std=c++23 -triple arm64-apple-ios -fptrauth-calls -fptrauth-vtable-pointer-type-discrimination -emit-llvm -O1 -disable-llvm-passes -no-enable-noundef-analysis -o - | FileCheck --check-prefix=CHECK-TYPEAUTH %s // RUN: %clang_cc1 %s -x c++ -std=c++23 -triple arm64-apple-ios -fptrauth-calls -fptrauth-vtable-pointer-address-discrimination -emit-llvm -O1 -disable-llvm-passes -no-enable-noundef-analysis -o - | FileCheck --check-prefix=CHECK-ADDRESSAUTH %s @@ -35,309 +36,1290 @@ template struct same_type { static const bool value = true; }; +// CHECK-NOAUTH-LABEL: define ptr @_ZN5test11aEPNS_1AE( +// CHECK-NOAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-NOAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NOAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA6:![0-9]+]] +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9:![0-9]+]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @_ZN5test11aEPNS_1AE( +// CHECK-TYPEAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*:]] +// 
CHECK-TYPEAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TYPEAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA6:![0-9]+]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9:![0-9]+]] +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP1]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = load volatile i8, ptr [[TMP3]], align 8 +// CHECK-TYPEAUTH-NEXT: ret ptr [[TMP3]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @_ZN5test11aEPNS_1AE( +// CHECK-ADDRESSAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-ADDRESSAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-ADDRESSAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA6:![0-9]+]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9:![0-9]+]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 [[TMP1]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = load volatile i8, ptr [[TMP4]], align 8 +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP4]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @_ZN5test11aEPNS_1AE( +// CHECK-BOTHAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-BOTHAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-BOTHAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA6:![0-9]+]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9:![0-9]+]] +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP1]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = load volatile i8, ptr [[TMP5]], align 8 +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP5]] +// const void *a(A *o) { static_assert(same_type::value); - // CHECK-NOAUTH: define ptr @_ZN5test11aEPNS_1AE(ptr %o) #0 { - // CHECK-TYPEAUTH: define ptr @_ZN5test11aEPNS_1AE(ptr %o) #0 { return __builtin_get_vtable_pointer(o); - // CHECK-NOAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %0 = load ptr, ptr %o.addr, align 8 - // CHECK-TYPEAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %1 = ptrtoint ptr %vtable to i64 - // CHECK-TYPEAUTH: %2 = call i64 @llvm.ptrauth.auth(i64 %1, i32 2, i64 48388) - // CHECK-TYPEAUTH: %3 = inttoptr i64 %2 to ptr - // CHECK-TYPEAUTH: %4 = load volatile i8, ptr %3, align 8 - // CHECK-ADDRESSAUTH: %2 = ptrtoint ptr %vtable to i64 - // CHECK-ADDRESSAUTH: %3 = call i64 @llvm.ptrauth.auth(i64 %2, i32 2, i64 %1) - // CHECK-ADDRESSAUTH: %4 = inttoptr i64 %3 to ptr - // 
CHECK-ADDRESSAUTH: %5 = load volatile i8, ptr %4, align 8 - // CHECK-BOTHAUTH: [[T1:%.*]] = ptrtoint ptr %0 to i64 - // CHECK-BOTHAUTH: [[T2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T1]], i64 48388) - // CHECK-BOTHAUTH: [[T3:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-BOTHAUTH: [[T4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T3]], i32 2, i64 [[T2]]) - // CHECK-BOTHAUTH: [[T5:%.*]] = inttoptr i64 [[T4]] to ptr - // CHECK-BOTHAUTH: [[T6:%.*]] = load volatile i8, ptr [[T5]], align 8 } +// CHECK-NOAUTH-LABEL: define ptr @_ZN5test11bEPNS_1BE( +// CHECK-NOAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-NOAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NOAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA11:![0-9]+]] +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA11]] +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @_ZN5test11bEPNS_1BE( +// CHECK-TYPEAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-TYPEAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TYPEAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA11:![0-9]+]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA11]] +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP1]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = load volatile i8, ptr [[TMP3]], align 8 +// CHECK-TYPEAUTH-NEXT: ret ptr [[TMP3]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @_ZN5test11bEPNS_1BE( +// CHECK-ADDRESSAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-ADDRESSAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-ADDRESSAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA11:![0-9]+]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA11]] +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 [[TMP1]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = load volatile i8, ptr [[TMP4]], align 8 +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP4]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @_ZN5test11bEPNS_1BE( +// CHECK-BOTHAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-BOTHAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-BOTHAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA11:![0-9]+]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA11]] +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP1]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 
+// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = load volatile i8, ptr [[TMP5]], align 8 +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP5]] +// const void *b(B *o) { static_assert(same_type::value); - // CHECK-TYPEAUTH: define ptr @_ZN5test11bEPNS_1BE(ptr %o) #0 { - // CHECK-NOAUTH: define ptr @_ZN5test11bEPNS_1BE(ptr %o) #0 { return __builtin_get_vtable_pointer(o); - // CHECK-NOAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %1 = ptrtoint ptr %vtable to i64 - // CHECK-TYPEAUTH: %2 = call i64 @llvm.ptrauth.auth(i64 %1, i32 2, i64 48388) - // CHECK-TYPEAUTH: %3 = inttoptr i64 %2 to ptr - // CHECK-TYPEAUTH: %4 = load volatile i8, ptr %3, align 8 - // CHECK-ADDRESSAUTH: %2 = ptrtoint ptr %vtable to i64 - // CHECK-ADDRESSAUTH: %3 = call i64 @llvm.ptrauth.auth(i64 %2, i32 2, i64 %1) - // CHECK-ADDRESSAUTH: %4 = inttoptr i64 %3 to ptr - // CHECK-ADDRESSAUTH: %5 = load volatile i8, ptr %4, align 8 - // CHECK-BOTHAUTH: [[T2:%.*]] = call i64 @llvm.ptrauth.blend(i64 %1, i64 48388) - // CHECK-BOTHAUTH: [[T3:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-BOTHAUTH: [[T4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T3]], i32 2, i64 [[T2]]) - // CHECK-BOTHAUTH: [[T5:%.*]] = inttoptr i64 [[T4]] to ptr - // CHECK-BOTHAUTH: [[T6:%.*]] = load volatile i8, ptr [[T5]], align 8 } +// CHECK-NOAUTH-LABEL: define ptr @_ZN5test16b_as_AEPNS_1BE( +// CHECK-NOAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-NOAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NOAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA11]] +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA11]] +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @_ZN5test16b_as_AEPNS_1BE( +// CHECK-TYPEAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-TYPEAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TYPEAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA11]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA11]] +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP1]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = load volatile i8, ptr [[TMP3]], align 8 +// CHECK-TYPEAUTH-NEXT: ret ptr [[TMP3]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @_ZN5test16b_as_AEPNS_1BE( +// CHECK-ADDRESSAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-ADDRESSAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-ADDRESSAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA11]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA11]] +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: 
[[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 [[TMP1]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = load volatile i8, ptr [[TMP4]], align 8 +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP4]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @_ZN5test16b_as_AEPNS_1BE( +// CHECK-BOTHAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-BOTHAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-BOTHAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA11]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA11]] +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP1]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = load volatile i8, ptr [[TMP5]], align 8 +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP5]] +// const void *b_as_A(B *o) { static_assert(same_type::value); - // CHECK-NOAUTH: define ptr @_ZN5test16b_as_AEPNS_1BE(ptr %o) #0 { return __builtin_get_vtable_pointer((A *)o); - // CHECK-NOAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %1 = ptrtoint ptr %vtable to i64 - // CHECK-TYPEAUTH: %2 = call i64 @llvm.ptrauth.auth(i64 %1, i32 2, i64 48388) - // CHECK-TYPEAUTH: %3 = inttoptr i64 %2 to ptr - // CHECK-TYPEAUTH: %4 = load volatile i8, ptr %3, align 8 - // CHECK-ADDRESSAUTH: %2 = ptrtoint ptr %vtable to i64 - // CHECK-ADDRESSAUTH: %3 = call i64 @llvm.ptrauth.auth(i64 %2, i32 2, i64 %1) - // CHECK-ADDRESSAUTH: %4 = inttoptr i64 %3 to ptr - // CHECK-ADDRESSAUTH: %5 = load volatile i8, ptr %4, align 8 - // CHECK-BOTHAUTH: [[T2:%.*]] = call i64 @llvm.ptrauth.blend(i64 %1, i64 48388) - // CHECK-BOTHAUTH: [[T3:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-BOTHAUTH: [[T4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T3]], i32 2, i64 [[T2]]) - // CHECK-BOTHAUTH: [[T5:%.*]] = inttoptr i64 [[T4]] to ptr - // CHECK-BOTHAUTH: [[T6:%.*]] = load volatile i8, ptr [[T5]], align 8 } +// CHECK-NOAUTH-LABEL: define ptr @_ZN5test11cEPNS_1CE( +// CHECK-NOAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-NOAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NOAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA13:![0-9]+]] +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @_ZN5test11cEPNS_1CE( +// CHECK-TYPEAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-TYPEAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TYPEAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA13:![0-9]+]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// 
CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP1]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = load volatile i8, ptr [[TMP3]], align 8 +// CHECK-TYPEAUTH-NEXT: ret ptr [[TMP3]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @_ZN5test11cEPNS_1CE( +// CHECK-ADDRESSAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-ADDRESSAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-ADDRESSAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA13:![0-9]+]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 [[TMP1]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = load volatile i8, ptr [[TMP4]], align 8 +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP4]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @_ZN5test11cEPNS_1CE( +// CHECK-BOTHAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-BOTHAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-BOTHAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA13:![0-9]+]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP1]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = load volatile i8, ptr [[TMP5]], align 8 +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP5]] +// const void *c(C *o) { static_assert(same_type::value); - // CHECK-NOAUTH: define ptr @_ZN5test11cEPNS_1CE(ptr %o) #0 { return __builtin_get_vtable_pointer(o); - // CHECK-NOAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %1 = ptrtoint ptr %vtable to i64 - // CHECK-TYPEAUTH: %2 = call i64 @llvm.ptrauth.auth(i64 %1, i32 2, i64 48388) - // CHECK-TYPEAUTH: %3 = inttoptr i64 %2 to ptr - // CHECK-TYPEAUTH: %4 = load volatile i8, ptr %3, align 8 - // CHECK-ADDRESSAUTH: %2 = ptrtoint ptr %vtable to i64 - // CHECK-ADDRESSAUTH: %3 = call i64 @llvm.ptrauth.auth(i64 %2, i32 2, i64 %1) - // CHECK-ADDRESSAUTH: %4 = inttoptr i64 %3 to ptr - // CHECK-ADDRESSAUTH: %5 = load volatile i8, ptr %4, align 8 - // CHECK-BOTHAUTH: [[T2:%.*]] = call i64 @llvm.ptrauth.blend(i64 %1, i64 48388) - // CHECK-BOTHAUTH: [[T3:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-BOTHAUTH: [[T4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T3]], i32 2, i64 [[T2]]) - // CHECK-BOTHAUTH: [[T5:%.*]] = inttoptr i64 [[T4]] to ptr - // CHECK-BOTHAUTH: [[T6:%.*]] = load volatile i8, ptr [[T5]], align 8 } +// CHECK-NOAUTH-LABEL: define ptr @_ZN5test16c_as_ZEPNS_1CE( +// CHECK-NOAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: 
[[ENTRY:.*:]] +// CHECK-NOAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NOAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @_ZN5test16c_as_ZEPNS_1CE( +// CHECK-TYPEAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-TYPEAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TYPEAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP1]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = load volatile i8, ptr [[TMP3]], align 8 +// CHECK-TYPEAUTH-NEXT: ret ptr [[TMP3]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @_ZN5test16c_as_ZEPNS_1CE( +// CHECK-ADDRESSAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-ADDRESSAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-ADDRESSAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 [[TMP1]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = load volatile i8, ptr [[TMP4]], align 8 +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP4]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @_ZN5test16c_as_ZEPNS_1CE( +// CHECK-BOTHAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-BOTHAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-BOTHAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP1]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = load volatile i8, ptr [[TMP5]], align 8 +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP5]] +// const void *c_as_Z(C *o) { static_assert(same_type::value); - // CHECK-NOAUTH: define ptr @_ZN5test16c_as_ZEPNS_1CE(ptr %o) #0 { return __builtin_get_vtable_pointer((Z *)o); - // CHECK-NOAUTH: %0 = load ptr, ptr %o.addr, align 8 - // CHECK-NOAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %vtable = load ptr, ptr %0, align 8 - 
// CHECK-TYPEAUTH: %1 = ptrtoint ptr %vtable to i64 - // CHECK-TYPEAUTH: %2 = call i64 @llvm.ptrauth.auth(i64 %1, i32 2, i64 48388) - // CHECK-TYPEAUTH: %3 = inttoptr i64 %2 to ptr - // CHECK-TYPEAUTH: %4 = load volatile i8, ptr %3, align 8 - // CHECK-ADDRESSAUTH: %2 = ptrtoint ptr %vtable to i64 - // CHECK-ADDRESSAUTH: %3 = call i64 @llvm.ptrauth.auth(i64 %2, i32 2, i64 %1) - // CHECK-ADDRESSAUTH: %4 = inttoptr i64 %3 to ptr - // CHECK-ADDRESSAUTH: %5 = load volatile i8, ptr %4, align 8 - // CHECK-BOTHAUTH: [[T2:%.*]] = call i64 @llvm.ptrauth.blend(i64 %1, i64 48388) - // CHECK-BOTHAUTH: [[T3:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-BOTHAUTH: [[T4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T3]], i32 2, i64 [[T2]]) - // CHECK-BOTHAUTH: [[T5:%.*]] = inttoptr i64 [[T4]] to ptr - // CHECK-BOTHAUTH: [[T6:%.*]] = load volatile i8, ptr [[T5]], align 8 } +// CHECK-NOAUTH-LABEL: define ptr @_ZN5test16c_as_BEPNS_1CE( +// CHECK-NOAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-NOAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NOAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-NOAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-NOAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-NOAUTH: [[CAST_NOTNULL]]: +// CHECK-NOAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// CHECK-NOAUTH-NEXT: br label %[[CAST_END]] +// CHECK-NOAUTH: [[CAST_END]]: +// CHECK-NOAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[CAST_RESULT]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @_ZN5test16c_as_BEPNS_1CE( +// CHECK-TYPEAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-TYPEAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TYPEAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-TYPEAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-TYPEAUTH: [[CAST_NOTNULL]]: +// CHECK-TYPEAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// CHECK-TYPEAUTH-NEXT: br label %[[CAST_END]] +// CHECK-TYPEAUTH: [[CAST_END]]: +// CHECK-TYPEAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[CAST_RESULT]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP5:%.*]] = load volatile i8, ptr [[TMP4]], align 8 +// CHECK-TYPEAUTH-NEXT: ret ptr [[TMP4]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @_ZN5test16c_as_BEPNS_1CE( +// CHECK-ADDRESSAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-ADDRESSAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-ADDRESSAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = 
load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-ADDRESSAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-ADDRESSAUTH: [[CAST_NOTNULL]]: +// CHECK-ADDRESSAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// CHECK-ADDRESSAUTH-NEXT: br label %[[CAST_END]] +// CHECK-ADDRESSAUTH: [[CAST_END]]: +// CHECK-ADDRESSAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[CAST_RESULT]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[CAST_RESULT]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP6:%.*]] = load volatile i8, ptr [[TMP5]], align 8 +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP5]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @_ZN5test16c_as_BEPNS_1CE( +// CHECK-BOTHAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-BOTHAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-BOTHAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA13]] +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-BOTHAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-BOTHAUTH: [[CAST_NOTNULL]]: +// CHECK-BOTHAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// CHECK-BOTHAUTH-NEXT: br label %[[CAST_END]] +// CHECK-BOTHAUTH: [[CAST_END]]: +// CHECK-BOTHAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[CAST_RESULT]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[CAST_RESULT]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP2]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP4]], i32 2, i64 [[TMP3]]) +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP7:%.*]] = load volatile i8, ptr [[TMP6]], align 8 +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP6]] +// const void *c_as_B(C *o) { static_assert(same_type::value); - // CHECK-NOAUTH: define ptr @_ZN5test16c_as_BEPNS_1CE(ptr %o) #0 { return __builtin_get_vtable_pointer((B *)o); - // CHECK-NOAUTH: %add.ptr = getelementptr inbounds i8, ptr %0, i64 8 - // CHECK-NOAUTH: br label %cast.end - // CHECK-NOAUTH: %cast.result = phi ptr [ %add.ptr, %cast.notnull ], [ null, %entry ] - // CHECK-NOAUTH: %vtable = load ptr, ptr %cast.result, align 8 - // CHECK-TYPEAUTH: %cast.result = phi ptr [ %add.ptr, %cast.notnull ], [ null, %entry ] - // CHECK-TYPEAUTH: %vtable = load ptr, ptr %cast.result, align 8 - // CHECK-TYPEAUTH: %2 = ptrtoint ptr %vtable to i64 - // CHECK-TYPEAUTH: %3 = call i64 @llvm.ptrauth.auth(i64 %2, i32 2, i64 48388) - // CHECK-TYPEAUTH: %4 = inttoptr i64 %3 to ptr - // CHECK-TYPEAUTH: %5 = load volatile i8, ptr %4, align 8 - // CHECK-ADDRESSAUTH: %2 = ptrtoint ptr %cast.result to i64 - // CHECK-ADDRESSAUTH: %3 
= ptrtoint ptr %vtable to i64 - // CHECK-ADDRESSAUTH: %4 = call i64 @llvm.ptrauth.auth(i64 %3, i32 2, i64 %2) - // CHECK-ADDRESSAUTH: %5 = inttoptr i64 %4 to ptr - // CHECK-ADDRESSAUTH: %6 = load volatile i8, ptr %5, align 8 - // CHECK-BOTHAUTH: [[T2:%.*]] = call i64 @llvm.ptrauth.blend(i64 %2, i64 48388) - // CHECK-BOTHAUTH: [[T3:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-BOTHAUTH: [[T4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T3]], i32 2, i64 [[T2]]) - // CHECK-BOTHAUTH: [[T5:%.*]] = inttoptr i64 [[T4]] to ptr - // CHECK-BOTHAUTH: [[T6:%.*]] = load volatile i8, ptr [[T5]], align 8 } +// CHECK-NOAUTH-LABEL: define ptr @_ZN5test11dEPNS_1DE( +// CHECK-NOAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-NOAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NOAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA15:![0-9]+]] +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA15]] +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @_ZN5test11dEPNS_1DE( +// CHECK-TYPEAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-TYPEAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TYPEAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA15:![0-9]+]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA15]] +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP1]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = load volatile i8, ptr [[TMP3]], align 8 +// CHECK-TYPEAUTH-NEXT: ret ptr [[TMP3]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @_ZN5test11dEPNS_1DE( +// CHECK-ADDRESSAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-ADDRESSAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-ADDRESSAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA15:![0-9]+]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA15]] +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 [[TMP1]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = load volatile i8, ptr [[TMP4]], align 8 +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP4]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @_ZN5test11dEPNS_1DE( +// CHECK-BOTHAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-BOTHAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-BOTHAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA15:![0-9]+]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA15]] +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = call i64 
@llvm.ptrauth.blend(i64 [[TMP1]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = load volatile i8, ptr [[TMP5]], align 8 +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP5]] +// const void *d(D *o) { static_assert(same_type::value); - // CHECK-NOAUTH: define ptr @_ZN5test11dEPNS_1DE(ptr %o) #0 { return __builtin_get_vtable_pointer(o); - // CHECK-NOAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %1 = ptrtoint ptr %vtable to i64 - // CHECK-TYPEAUTH: %2 = call i64 @llvm.ptrauth.auth(i64 %1, i32 2, i64 48388) - // CHECK-TYPEAUTH: %3 = inttoptr i64 %2 to ptr - // CHECK-TYPEAUTH: %4 = load volatile i8, ptr %3, align 8 - // CHECK-ADDRESSAUTH: %1 = ptrtoint ptr %0 to i64 - // CHECK-ADDRESSAUTH: %2 = ptrtoint ptr %vtable to i64 - // CHECK-ADDRESSAUTH: %3 = call i64 @llvm.ptrauth.auth(i64 %2, i32 2, i64 %1) - // CHECK-ADDRESSAUTH: %4 = inttoptr i64 %3 to ptr - // CHECK-ADDRESSAUTH: %5 = load volatile i8, ptr %4, align 8 - // CHECK-BOTHAUTH: [[T1:%.*]] = ptrtoint ptr %0 to i64 - // CHECK-BOTHAUTH: [[T2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T1]], i64 48388) - // CHECK-BOTHAUTH: [[T3:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-BOTHAUTH: [[T4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T3]], i32 2, i64 [[T2]]) - // CHECK-BOTHAUTH: [[T5:%.*]] = inttoptr i64 [[T4]] to ptr - // CHECK-BOTHAUTH: [[T6:%.*]] = load volatile i8, ptr [[T5]], align 8 } +// CHECK-NOAUTH-LABEL: define ptr @_ZN5test16d_as_AEPNS_1DE( +// CHECK-NOAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-NOAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NOAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA15]] +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA15]] +// CHECK-NOAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-NOAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-NOAUTH: [[CAST_NOTNULL]]: +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: [[VBASE_OFFSET_PTR:%.*]] = getelementptr i8, ptr [[VTABLE]], i64 -32 +// CHECK-NOAUTH-NEXT: [[VBASE_OFFSET:%.*]] = load i64, ptr [[VBASE_OFFSET_PTR]], align 8 +// CHECK-NOAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[VBASE_OFFSET]] +// CHECK-NOAUTH-NEXT: br label %[[CAST_END]] +// CHECK-NOAUTH: [[CAST_END]]: +// CHECK-NOAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-NOAUTH-NEXT: [[VTABLE1:%.*]] = load ptr, ptr [[CAST_RESULT]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE1]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @_ZN5test16d_as_AEPNS_1DE( +// CHECK-TYPEAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-TYPEAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TYPEAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA15]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA15]] +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-TYPEAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-TYPEAUTH: [[CAST_NOTNULL]]: +// 
CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-TYPEAUTH-NEXT: [[VBASE_OFFSET_PTR:%.*]] = getelementptr i8, ptr [[TMP4]], i64 -32 +// CHECK-TYPEAUTH-NEXT: [[VBASE_OFFSET:%.*]] = load i64, ptr [[VBASE_OFFSET_PTR]], align 8 +// CHECK-TYPEAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[VBASE_OFFSET]] +// CHECK-TYPEAUTH-NEXT: br label %[[CAST_END]] +// CHECK-TYPEAUTH: [[CAST_END]]: +// CHECK-TYPEAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-TYPEAUTH-NEXT: [[VTABLE1:%.*]] = load ptr, ptr [[CAST_RESULT]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[VTABLE1]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP6:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP5]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP8:%.*]] = load volatile i8, ptr [[TMP7]], align 8 +// CHECK-TYPEAUTH-NEXT: ret ptr [[TMP7]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @_ZN5test16d_as_AEPNS_1DE( +// CHECK-ADDRESSAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-ADDRESSAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-ADDRESSAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA15]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA15]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-ADDRESSAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-ADDRESSAUTH: [[CAST_NOTNULL]]: +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[VBASE_OFFSET_PTR:%.*]] = getelementptr i8, ptr [[TMP5]], i64 -32 +// CHECK-ADDRESSAUTH-NEXT: [[VBASE_OFFSET:%.*]] = load i64, ptr [[VBASE_OFFSET_PTR]], align 8 +// CHECK-ADDRESSAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[VBASE_OFFSET]] +// CHECK-ADDRESSAUTH-NEXT: br label %[[CAST_END]] +// CHECK-ADDRESSAUTH: [[CAST_END]]: +// CHECK-ADDRESSAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE1:%.*]] = load ptr, ptr [[CAST_RESULT]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[CAST_RESULT]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[VTABLE1]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP8:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP7]], i32 2, i64 [[TMP6]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP10:%.*]] = load volatile i8, ptr [[TMP9]], align 8 +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP9]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @_ZN5test16d_as_AEPNS_1DE( +// CHECK-BOTHAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*]]: +// 
CHECK-BOTHAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-BOTHAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA15]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA15]] +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-BOTHAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-BOTHAUTH: [[CAST_NOTNULL]]: +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP2]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP4]], i32 2, i64 [[TMP3]]) +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +// CHECK-BOTHAUTH-NEXT: [[VBASE_OFFSET_PTR:%.*]] = getelementptr i8, ptr [[TMP6]], i64 -32 +// CHECK-BOTHAUTH-NEXT: [[VBASE_OFFSET:%.*]] = load i64, ptr [[VBASE_OFFSET_PTR]], align 8 +// CHECK-BOTHAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[VBASE_OFFSET]] +// CHECK-BOTHAUTH-NEXT: br label %[[CAST_END]] +// CHECK-BOTHAUTH: [[CAST_END]]: +// CHECK-BOTHAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-BOTHAUTH-NEXT: [[VTABLE1:%.*]] = load ptr, ptr [[CAST_RESULT]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[CAST_RESULT]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP8:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP7]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[VTABLE1]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP10:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP9]], i32 2, i64 [[TMP8]]) +// CHECK-BOTHAUTH-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP12:%.*]] = load volatile i8, ptr [[TMP11]], align 8 +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP11]] +// const void *d_as_A(D *o) { static_assert(same_type::value); - // CHECK-NOAUTH: define ptr @_ZN5test16d_as_AEPNS_1DE(ptr %o) #0 { return __builtin_get_vtable_pointer((A *)o); - // CHECK-NOAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-NOAUTH: %vbase.offset.ptr = getelementptr i8, ptr %vtable, i64 -32 - // CHECK-NOAUTH: %vbase.offset = load i64, ptr %vbase.offset.ptr, align 8 - // CHECK-NOAUTH: %add.ptr = getelementptr inbounds i8, ptr %0, i64 %vbase.offset - // CHECK-NOAUTH: %cast.result = phi ptr [ %add.ptr, %cast.notnull ], [ null, %entry ] - // CHECK-NOAUTH: %vtable1 = load ptr, ptr %cast.result, align 8 - // CHECK-TYPEAUTH: %vtable1 = load ptr, ptr %cast.result, align 8 - // CHECK-TYPEAUTH: %5 = ptrtoint ptr %vtable1 to i64 - // CHECK-TYPEAUTH: %6 = call i64 @llvm.ptrauth.auth(i64 %5, i32 2, i64 48388) - // CHECK-TYPEAUTH: %7 = inttoptr i64 %6 to ptr - // CHECK-TYPEAUTH: %8 = load volatile i8, ptr %7, align 8 - // CHECK-ADDRESSAUTH: %6 = ptrtoint ptr %cast.result to i64 - // CHECK-ADDRESSAUTH: %7 = ptrtoint ptr %vtable1 to i64 - // CHECK-ADDRESSAUTH: %8 = call i64 @llvm.ptrauth.auth(i64 %7, i32 2, i64 %6) - // CHECK-ADDRESSAUTH: %9 = inttoptr i64 %8 to ptr - // CHECK-ADDRESSAUTH: %10 = load volatile i8, ptr %9, align 8 - // CHECK-BOTHAUTH: [[T1:%.*]] = ptrtoint ptr %cast.result to i64 - // CHECK-BOTHAUTH: [[T2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T1]], i64 48388) - // CHECK-BOTHAUTH: [[T3:%.*]] = ptrtoint ptr %vtable1 to i64 - // CHECK-BOTHAUTH: [[T4:%.*]] = 
call i64 @llvm.ptrauth.auth(i64 [[T3]], i32 2, i64 [[T2]]) - // CHECK-BOTHAUTH: [[T5:%.*]] = inttoptr i64 [[T4]] to ptr - // CHECK-BOTHAUTH: [[T6:%.*]] = load volatile i8, ptr [[T5]], align 8 } +// CHECK-NOAUTH-LABEL: define ptr @_ZN5test11eEPNS_1EE( +// CHECK-NOAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-NOAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NOAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA17:![0-9]+]] +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @_ZN5test11eEPNS_1EE( +// CHECK-TYPEAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-TYPEAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TYPEAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA17:![0-9]+]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP1]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = load volatile i8, ptr [[TMP3]], align 8 +// CHECK-TYPEAUTH-NEXT: ret ptr [[TMP3]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @_ZN5test11eEPNS_1EE( +// CHECK-ADDRESSAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-ADDRESSAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-ADDRESSAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA17:![0-9]+]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 [[TMP1]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = load volatile i8, ptr [[TMP4]], align 8 +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP4]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @_ZN5test11eEPNS_1EE( +// CHECK-BOTHAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-BOTHAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-BOTHAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA17:![0-9]+]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP1]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = load volatile i8, ptr [[TMP5]], align 8 +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP5]] +// 
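The blocks above complete the autogenerated coverage for test1::e: in every ptrauth mode the vtable slot of E is loaded directly, and when authentication is enabled the loaded value is verified with key 2 (discriminated by the object address, by the constant 48388, or both) and then probed with a volatile i8 load, so a forged pointer traps at the call site instead of propagating. A self-contained sketch of the same shape (Base here is a hypothetical stand-in for the test's hierarchy):

struct Base { virtual ~Base() = default; };

const void *vtable_of(Base *p) {
  // With no ptrauth flags this is a single load of p's vtable slot; under
  // -fptrauth-vtable-pointer-{type,address}-discrimination Clang additionally
  // emits llvm.ptrauth.auth (key 2) and a volatile i8 probe of the result.
  return __builtin_get_vtable_pointer(p);
}

e_as_B below is the interesting variant: the cast to the B subobject adds a fixed offset of 8 before the same load-and-authenticate sequence.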
const void *e(E *o) { static_assert(same_type::value); - // CHECK-NOAUTH: define ptr @_ZN5test11eEPNS_1EE(ptr %o) #0 { return __builtin_get_vtable_pointer(o); - // CHECK-NOAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %1 = ptrtoint ptr %vtable to i64 - // CHECK-TYPEAUTH: %2 = call i64 @llvm.ptrauth.auth(i64 %1, i32 2, i64 48388) - // CHECK-TYPEAUTH: %3 = inttoptr i64 %2 to ptr - // CHECK-TYPEAUTH: %4 = load volatile i8, ptr %3, align 8 - // CHECK-ADDRESSAUTH: [[T1:%.*]] = ptrtoint ptr %0 to i64 - // CHECK-ADDRESSAUTH: [[T2:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-ADDRESSAUTH: [[T3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T2]], i32 2, i64 [[T1]]) - // CHECK-ADDRESSAUTH: [[T4:%.*]] = inttoptr i64 [[T3]] to ptr - // CHECK-ADDRESSAUTH: [[T5:%.*]] = load volatile i8, ptr [[T4]], align 8 - // CHECK-BOTHAUTH: [[T1:%.*]] = ptrtoint ptr %0 to i64 - // CHECK-BOTHAUTH: [[T2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T1]], i64 48388) - // CHECK-BOTHAUTH: [[T3:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-BOTHAUTH: [[T4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T3]], i32 2, i64 [[T2]]) - // CHECK-BOTHAUTH: [[T5:%.*]] = inttoptr i64 [[T4]] to ptr - // CHECK-BOTHAUTH: [[T6:%.*]] = load volatile i8, ptr [[T5]], align 8 } +// CHECK-NOAUTH-LABEL: define ptr @_ZN5test16e_as_BEPNS_1EE( +// CHECK-NOAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-NOAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NOAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-NOAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-NOAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-NOAUTH: [[CAST_NOTNULL]]: +// CHECK-NOAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// CHECK-NOAUTH-NEXT: br label %[[CAST_END]] +// CHECK-NOAUTH: [[CAST_END]]: +// CHECK-NOAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[CAST_RESULT]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @_ZN5test16e_as_BEPNS_1EE( +// CHECK-TYPEAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-TYPEAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TYPEAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-TYPEAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-TYPEAUTH: [[CAST_NOTNULL]]: +// CHECK-TYPEAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// CHECK-TYPEAUTH-NEXT: br label %[[CAST_END]] +// CHECK-TYPEAUTH: [[CAST_END]]: +// CHECK-TYPEAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[CAST_RESULT]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP5:%.*]] = 
load volatile i8, ptr [[TMP4]], align 8 +// CHECK-TYPEAUTH-NEXT: ret ptr [[TMP4]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @_ZN5test16e_as_BEPNS_1EE( +// CHECK-ADDRESSAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-ADDRESSAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-ADDRESSAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-ADDRESSAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-ADDRESSAUTH: [[CAST_NOTNULL]]: +// CHECK-ADDRESSAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// CHECK-ADDRESSAUTH-NEXT: br label %[[CAST_END]] +// CHECK-ADDRESSAUTH: [[CAST_END]]: +// CHECK-ADDRESSAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[CAST_RESULT]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[CAST_RESULT]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP6:%.*]] = load volatile i8, ptr [[TMP5]], align 8 +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP5]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @_ZN5test16e_as_BEPNS_1EE( +// CHECK-BOTHAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-BOTHAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-BOTHAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[TMP0]], null +// CHECK-BOTHAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-BOTHAUTH: [[CAST_NOTNULL]]: +// CHECK-BOTHAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// CHECK-BOTHAUTH-NEXT: br label %[[CAST_END]] +// CHECK-BOTHAUTH: [[CAST_END]]: +// CHECK-BOTHAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[CAST_RESULT]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[CAST_RESULT]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP2]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP4]], i32 2, i64 [[TMP3]]) +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP7:%.*]] = load volatile i8, ptr [[TMP6]], align 8 +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP6]] +// const void *e_as_B(E *o) { static_assert(same_type::value); - // CHECK-NOAUTH: define ptr @_ZN5test16e_as_BEPNS_1EE(ptr %o) #0 { return __builtin_get_vtable_pointer((B *)o); - // CHECK-NOAUTH: %add.ptr = getelementptr inbounds i8, ptr %0, i64 8 - // CHECK-NOAUTH: %cast.result = phi ptr [ %add.ptr, %cast.notnull ], [ null, %entry ] - // CHECK-NOAUTH: %vtable = load ptr, ptr %cast.result, align 8 - // CHECK-TYPEAUTH: %vtable = load ptr, ptr %cast.result, align 8 - // 
CHECK-TYPEAUTH: %2 = ptrtoint ptr %vtable to i64 - // CHECK-TYPEAUTH: %3 = call i64 @llvm.ptrauth.auth(i64 %2, i32 2, i64 48388) - // CHECK-TYPEAUTH: %4 = inttoptr i64 %3 to ptr - // CHECK-TYPEAUTH: %5 = load volatile i8, ptr %4, align 8 - // CHECK-ADDRESSAUTH: [[T1:%.*]] = ptrtoint ptr %cast.result to i64 - // CHECK-ADDRESSAUTH: [[T2:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-ADDRESSAUTH: [[T3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T2]], i32 2, i64 [[T1]]) - // CHECK-ADDRESSAUTH: [[T4:%.*]] = inttoptr i64 [[T3]] to ptr - // CHECK-ADDRESSAUTH: [[T5:%.*]] = load volatile i8, ptr [[T4]], align 8 - // CHECK-BOTHAUTH: [[T1:%.*]] = ptrtoint ptr %cast.result to i64 - // CHECK-BOTHAUTH: [[T2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T1]], i64 48388) - // CHECK-BOTHAUTH: [[T3:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-BOTHAUTH: [[T4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T3]], i32 2, i64 [[T2]]) - // CHECK-BOTHAUTH: [[T5:%.*]] = inttoptr i64 [[T4]] to ptr - // CHECK-BOTHAUTH: [[T6:%.*]] = load volatile i8, ptr [[T5]], align 8 } +// CHECK-NOAUTH-LABEL: define ptr @_ZN5test16e_as_DEPNS_1EE( +// CHECK-NOAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-NOAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NOAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @_ZN5test16e_as_DEPNS_1EE( +// CHECK-TYPEAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-TYPEAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TYPEAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP1]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = load volatile i8, ptr [[TMP3]], align 8 +// CHECK-TYPEAUTH-NEXT: ret ptr [[TMP3]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @_ZN5test16e_as_DEPNS_1EE( +// CHECK-ADDRESSAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-ADDRESSAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-ADDRESSAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 [[TMP1]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = load volatile i8, ptr [[TMP4]], align 8 +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP4]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @_ZN5test16e_as_DEPNS_1EE( +// CHECK-BOTHAUTH-SAME: ptr [[O:%.*]]) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*:]] +// 
CHECK-BOTHAUTH-NEXT: [[O_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-BOTHAUTH-NEXT: store ptr [[O]], ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[O_ADDR]], align 8, !tbaa [[TBAA17]] +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP1]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = load volatile i8, ptr [[TMP5]], align 8 +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP5]] +// const void *e_as_D(E *o) { static_assert(same_type::value); - // CHECK-NOAUTH: define ptr @_ZN5test16e_as_DEPNS_1EE(ptr %o) #0 { return __builtin_get_vtable_pointer((D *)o); - // CHECK-NOAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %vtable = load ptr, ptr %0, align 8 - // CHECK-TYPEAUTH: %1 = ptrtoint ptr %vtable to i64 - // CHECK-TYPEAUTH: %2 = call i64 @llvm.ptrauth.auth(i64 %1, i32 2, i64 48388) - // CHECK-TYPEAUTH: %3 = inttoptr i64 %2 to ptr - // CHECK-TYPEAUTH: %4 = load volatile i8, ptr %3, align 8 - // CHECK-ADDRESSAUTH: [[T1:%.*]] = ptrtoint ptr %0 to i64 - // CHECK-ADDRESSAUTH: [[T2:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-ADDRESSAUTH: [[T3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T2]], i32 2, i64 [[T1]]) - // CHECK-ADDRESSAUTH: [[T4:%.*]] = inttoptr i64 [[T3]] to ptr - // CHECK-ADDRESSAUTH: [[T5:%.*]] = load volatile i8, ptr [[T4]], align 8 - // CHECK-BOTHAUTH: [[T1:%.*]] = ptrtoint ptr %0 to i64 - // CHECK-BOTHAUTH: [[T2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[T1]], i64 48388) - // CHECK-BOTHAUTH: [[T3:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-BOTHAUTH: [[T4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[T3]], i32 2, i64 [[T2]]) - // CHECK-BOTHAUTH: [[T5:%.*]] = inttoptr i64 [[T4]] to ptr - // CHECK-BOTHAUTH: [[T6:%.*]] = load volatile i8, ptr [[T5]], align 8 } +// CHECK-NOAUTH-LABEL: define ptr @aArrayParameter( +// CHECK-NOAUTH-SAME: ptr [[AARRAY:%.*]]) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-NOAUTH-NEXT: [[AARRAY_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NOAUTH-NEXT: store ptr [[AARRAY]], ptr [[AARRAY_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AARRAY_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @aArrayParameter( +// CHECK-TYPEAUTH-SAME: ptr [[AARRAY:%.*]]) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-TYPEAUTH-NEXT: [[AARRAY_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-TYPEAUTH-NEXT: store ptr [[AARRAY]], ptr [[AARRAY_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AARRAY_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP1]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = load volatile i8, ptr [[TMP3]], align 8 +// CHECK-TYPEAUTH-NEXT: 
ret ptr [[TMP3]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @aArrayParameter( +// CHECK-ADDRESSAUTH-SAME: ptr [[AARRAY:%.*]]) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-ADDRESSAUTH-NEXT: [[AARRAY_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-ADDRESSAUTH-NEXT: store ptr [[AARRAY]], ptr [[AARRAY_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AARRAY_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 [[TMP1]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = load volatile i8, ptr [[TMP4]], align 8 +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP4]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @aArrayParameter( +// CHECK-BOTHAUTH-SAME: ptr [[AARRAY:%.*]]) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-BOTHAUTH-NEXT: [[AARRAY_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-BOTHAUTH-NEXT: store ptr [[AARRAY]], ptr [[AARRAY_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AARRAY_ADDR]], align 8, !tbaa [[TBAA6]] +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP1]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = load volatile i8, ptr [[TMP5]], align 8 +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP5]] +// extern "C" const void *aArrayParameter(A aArray[]) { static_assert(same_type::value); - // CHECK-NOAUTH: [[THIS_OBJ:%.*]] = load ptr, ptr %aArray.addr - // CHECK-NOAUTH: %vtable = load ptr, ptr [[THIS_OBJ]] - // CHECK-TYPEAUTH: [[THIS_OBJ:%.*]] = load ptr, ptr %aArray.addr - // CHECK-TYPEAUTH: %vtable = load ptr, ptr [[THIS_OBJ]] - // CHECK-TYPEAUTH: [[VTABLEI:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-TYPEAUTH: [[AUTHENTICATED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI]], i32 2, i64 48388) - // CHECK-ADDRESSAUTH: [[VTABLE_ADDR:%.*]] = load ptr, ptr %aArray.addr, align 8, !tbaa !2 - // CHECK-ADDRESSAUTH: %vtable = load ptr, ptr %0, align 8, !tbaa !7 - // CHECK-ADDRESSAUTH: [[VTABLE_ADDRI:%.*]] = ptrtoint ptr [[VTABLE_ADDR]] to i64 - // CHECK-ADDRESSAUTH: [[VTABLEI:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-ADDRESSAUTH: [[AUTHENTICATED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI]], i32 2, i64 [[VTABLE_ADDRI]]) - // CHECK-BOTHAUTH: [[VTABLE_ADDR:%.*]] = load ptr, ptr %aArray.addr, align 8, !tbaa !2 - // CHECK-BOTHAUTH: %vtable = load ptr, ptr [[VTABLE_ADDR]], align 8, !tbaa !7 - // CHECK-BOTHAUTH: [[VTABLE_ADDRI:%.*]] = ptrtoint ptr [[VTABLE_ADDR]] to i64 - // CHECK-BOTHAUTH: [[VTABLE_DISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[VTABLE_ADDRI]], i64 48388) - // CHECK-BOTHAUTH: [[VTABLE_PTR:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-BOTHAUTH: [[AUTHENTICATED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLE_PTR]], i32 2, i64 [[VTABLE_DISC]]) return 
__builtin_get_vtable_pointer(aArray); } +// CHECK-NOAUTH-LABEL: define ptr @aArrayLocal( +// CHECK-NOAUTH-SAME: ) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-NOAUTH-NEXT: [[ARRAY:%.*]] = alloca [1 x %"struct.test1::A"], align 8 +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[ARRAY]]) #[[ATTR5:[0-9]+]] +// CHECK-NOAUTH-NEXT: call void @_ZN5test11AC1Ev(ptr nonnull align 8 dereferenceable(8) [[ARRAY]]) +// CHECK-NOAUTH-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %"struct.test1::A"], ptr [[ARRAY]], i64 0, i64 0 +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[ARRAYDECAY]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[ARRAY]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: ret ptr [[VTABLE]] +// +// CHECK-TYPEAUTH-LABEL: define ptr @aArrayLocal( +// CHECK-TYPEAUTH-SAME: ) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-TYPEAUTH-NEXT: [[ARRAY:%.*]] = alloca [1 x %"struct.test1::A"], align 8 +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[ARRAY]]) #[[ATTR7:[0-9]+]] +// CHECK-TYPEAUTH-NEXT: [[CALL:%.*]] = call ptr @_ZN5test11AC1Ev(ptr nonnull align 8 dereferenceable(8) [[ARRAY]]) +// CHECK-TYPEAUTH-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %"struct.test1::A"], ptr [[ARRAY]], i64 0, i64 0 +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[ARRAYDECAY]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP0]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = load volatile i8, ptr [[TMP2]], align 8 +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[ARRAY]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: ret ptr [[TMP2]] +// +// CHECK-ADDRESSAUTH-LABEL: define ptr @aArrayLocal( +// CHECK-ADDRESSAUTH-SAME: ) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-ADDRESSAUTH-NEXT: [[ARRAY:%.*]] = alloca [1 x %"struct.test1::A"], align 8 +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[ARRAY]]) #[[ATTR7:[0-9]+]] +// CHECK-ADDRESSAUTH-NEXT: [[CALL:%.*]] = call ptr @_ZN5test11AC1Ev(ptr nonnull align 8 dereferenceable(8) [[ARRAY]]) +// CHECK-ADDRESSAUTH-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %"struct.test1::A"], ptr [[ARRAY]], i64 0, i64 0 +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[ARRAYDECAY]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[ARRAYDECAY]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP1]], i32 2, i64 [[TMP0]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = load volatile i8, ptr [[TMP3]], align 8 +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[ARRAY]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: ret ptr [[TMP3]] +// +// CHECK-BOTHAUTH-LABEL: define ptr @aArrayLocal( +// CHECK-BOTHAUTH-SAME: ) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*:]] +// CHECK-BOTHAUTH-NEXT: [[ARRAY:%.*]] = alloca [1 x %"struct.test1::A"], align 8 +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[ARRAY]]) #[[ATTR7:[0-9]+]] +// CHECK-BOTHAUTH-NEXT: [[CALL:%.*]] = call ptr @_ZN5test11AC1Ev(ptr nonnull align 8 dereferenceable(8) [[ARRAY]]) +// CHECK-BOTHAUTH-NEXT: [[ARRAYDECAY:%.*]] = 
getelementptr inbounds [1 x %"struct.test1::A"], ptr [[ARRAY]], i64 0, i64 0 +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[ARRAYDECAY]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[ARRAYDECAY]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP0]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 [[TMP1]]) +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = load volatile i8, ptr [[TMP4]], align 8 +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[ARRAY]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: ret ptr [[TMP4]] +// extern "C" const void *aArrayLocal() { A array[] = { A() }; static_assert(same_type::value); - // CHECK-NOAUTH: [[THIS_OBJ:%.*]] = getelementptr inbounds [1 x %"struct.test1::A"], ptr %array - // CHECK-NOAUTH: %vtable = load ptr, ptr %arraydecay - // CHECK-TYPEAUTH: %arraydecay = getelementptr inbounds [1 x %"struct.test1::A"] - // CHECK-TYPEAUTH: %vtable = load ptr, ptr %arraydecay - // CHECK-TYPEAUTH: [[VTABLEI:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-TYPEAUTH: [[AUTHENTICATED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI]], i32 2, i64 48388) - // CHECK-ADDRESSAUTH: %arraydecay = getelementptr inbounds [1 x %"struct.test1::A"], ptr %array, i64 0, i64 0 - // CHECK-ADDRESSAUTH: %vtable = load ptr, ptr %arraydecay, align 8, !tbaa !7 - // CHECK-ADDRESSAUTH: [[VTABLE_ADDRI:%.*]] = ptrtoint ptr %arraydecay to i64 - // CHECK-ADDRESSAUTH: [[VTABLEI:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-ADDRESSAUTH: [[AUTHENTICATED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI]], i32 2, i64 [[VTABLE_ADDRI]]) - // CHECK-BOTHAUTH: %arraydecay = getelementptr inbounds [1 x %"struct.test1::A"], ptr %array, i64 0, i64 0 - // CHECK-BOTHAUTH: %vtable = load ptr, ptr %arraydecay, align 8, !tbaa !7 - // CHECK-BOTHAUTH: [[VTABLE_ADDRI:%.*]] = ptrtoint ptr %arraydecay to i64 - // CHECK-BOTHAUTH: [[VTABLE_DISC:%.*]] = call i64 @llvm.ptrauth.blend(i64 %0, i64 48388) - // CHECK-BOTHAUTH: [[VTABLEI:%.*]] = ptrtoint ptr %vtable to i64 - // CHECK-BOTHAUTH: [[AUTHENTICATED:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[VTABLEI]], i32 2, i64 [[VTABLE_DISC]]) return __builtin_get_vtable_pointer(array); } +// CHECK-NOAUTH-LABEL: define void @_ZN5test14testEv( +// CHECK-NOAUTH-SAME: ) #[[ATTR0]] { +// CHECK-NOAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-NOAUTH-NEXT: [[AINSTANCE:%.*]] = alloca %"struct.test1::A", align 8 +// CHECK-NOAUTH-NEXT: [[BINSTANCE:%.*]] = alloca %"struct.test1::B", align 8 +// CHECK-NOAUTH-NEXT: [[CINSTANCE:%.*]] = alloca %"struct.test1::C", align 8 +// CHECK-NOAUTH-NEXT: [[DINSTANCE:%.*]] = alloca %"struct.test1::D", align 8 +// CHECK-NOAUTH-NEXT: [[EINSTANCE:%.*]] = alloca %"struct.test1::E", align 8 +// CHECK-NOAUTH-NEXT: [[EARRAY:%.*]] = alloca [1 x %"struct.test1::E"], align 16 +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[AINSTANCE]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: call void @_ZN5test11AC1Ev(ptr nonnull align 8 dereferenceable(8) [[AINSTANCE]]) +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[BINSTANCE]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: call void @_ZN5test11BC1Ev(ptr nonnull align 8 dereferenceable(8) [[BINSTANCE]]) +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[CINSTANCE]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: call void @_ZN5test11CC1Ev(ptr nonnull align 8 
dereferenceable(16) [[CINSTANCE]]) +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[DINSTANCE]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: call void @_ZN5test11DC1Ev(ptr nonnull align 8 dereferenceable(8) [[DINSTANCE]]) +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[EINSTANCE]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: call void @_ZN5test11EC1Ev(ptr nonnull align 8 dereferenceable(16) [[EINSTANCE]]) +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[EARRAY]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[EARRAY]], i8 0, i64 16, i1 false) +// CHECK-NOAUTH-NEXT: call void @_ZN5test11EC1Ev(ptr nonnull align 8 dereferenceable(16) [[EARRAY]]) +// CHECK-NOAUTH-NEXT: [[CALL:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[AINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[CALL1:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[BINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[TMP0:%.*]] = icmp eq ptr [[CINSTANCE]], null +// CHECK-NOAUTH-NEXT: br i1 [[TMP0]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-NOAUTH: [[CAST_NOTNULL]]: +// CHECK-NOAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[CINSTANCE]], i64 8 +// CHECK-NOAUTH-NEXT: br label %[[CAST_END]] +// CHECK-NOAUTH: [[CAST_END]]: +// CHECK-NOAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-NOAUTH-NEXT: [[CALL2:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT]]) +// CHECK-NOAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[DINSTANCE]], null +// CHECK-NOAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END5:.*]], label %[[CAST_NOTNULL3:.*]] +// CHECK-NOAUTH: [[CAST_NOTNULL3]]: +// CHECK-NOAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[DINSTANCE]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: [[VBASE_OFFSET_PTR:%.*]] = getelementptr i8, ptr [[VTABLE]], i64 -32 +// CHECK-NOAUTH-NEXT: [[VBASE_OFFSET:%.*]] = load i64, ptr [[VBASE_OFFSET_PTR]], align 8 +// CHECK-NOAUTH-NEXT: [[ADD_PTR4:%.*]] = getelementptr inbounds i8, ptr [[DINSTANCE]], i64 [[VBASE_OFFSET]] +// CHECK-NOAUTH-NEXT: br label %[[CAST_END5]] +// CHECK-NOAUTH: [[CAST_END5]]: +// CHECK-NOAUTH-NEXT: [[CAST_RESULT6:%.*]] = phi ptr [ [[ADD_PTR4]], %[[CAST_NOTNULL3]] ], [ null, %[[CAST_END]] ] +// CHECK-NOAUTH-NEXT: [[CALL7:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT6]]) +// CHECK-NOAUTH-NEXT: [[TMP2:%.*]] = icmp eq ptr [[EINSTANCE]], null +// CHECK-NOAUTH-NEXT: br i1 [[TMP2]], label %[[CAST_END13:.*]], label %[[CAST_NOTNULL8:.*]] +// CHECK-NOAUTH: [[CAST_NOTNULL8]]: +// CHECK-NOAUTH-NEXT: [[VTABLE9:%.*]] = load ptr, ptr [[EINSTANCE]], align 8, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: [[VBASE_OFFSET_PTR10:%.*]] = getelementptr i8, ptr [[VTABLE9]], i64 -32 +// CHECK-NOAUTH-NEXT: [[VBASE_OFFSET11:%.*]] = load i64, ptr [[VBASE_OFFSET_PTR10]], align 8 +// CHECK-NOAUTH-NEXT: [[ADD_PTR12:%.*]] = getelementptr inbounds i8, ptr [[EINSTANCE]], i64 [[VBASE_OFFSET11]] +// CHECK-NOAUTH-NEXT: br label %[[CAST_END13]] +// CHECK-NOAUTH: [[CAST_END13]]: +// CHECK-NOAUTH-NEXT: [[CAST_RESULT14:%.*]] = phi ptr [ [[ADD_PTR12]], %[[CAST_NOTNULL8]] ], [ null, %[[CAST_END5]] ] +// CHECK-NOAUTH-NEXT: [[CALL15:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT14]]) +// CHECK-NOAUTH-NEXT: [[TMP3:%.*]] = icmp eq ptr [[EINSTANCE]], null +// CHECK-NOAUTH-NEXT: br i1 [[TMP3]], label %[[CAST_END18:.*]], label %[[CAST_NOTNULL16:.*]] +// CHECK-NOAUTH: [[CAST_NOTNULL16]]: +// CHECK-NOAUTH-NEXT: [[ADD_PTR17:%.*]] = getelementptr inbounds i8, ptr [[EINSTANCE]], i64 8 +// CHECK-NOAUTH-NEXT: br label %[[CAST_END18]] 
+// CHECK-NOAUTH: [[CAST_END18]]: +// CHECK-NOAUTH-NEXT: [[CAST_RESULT19:%.*]] = phi ptr [ [[ADD_PTR17]], %[[CAST_NOTNULL16]] ], [ null, %[[CAST_END13]] ] +// CHECK-NOAUTH-NEXT: [[CALL20:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT19]]) +// CHECK-NOAUTH-NEXT: [[CALL21:%.*]] = call ptr @_ZN5test11bEPNS_1BE(ptr [[BINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[TMP4:%.*]] = icmp eq ptr [[CINSTANCE]], null +// CHECK-NOAUTH-NEXT: br i1 [[TMP4]], label %[[CAST_END24:.*]], label %[[CAST_NOTNULL22:.*]] +// CHECK-NOAUTH: [[CAST_NOTNULL22]]: +// CHECK-NOAUTH-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i8, ptr [[CINSTANCE]], i64 8 +// CHECK-NOAUTH-NEXT: br label %[[CAST_END24]] +// CHECK-NOAUTH: [[CAST_END24]]: +// CHECK-NOAUTH-NEXT: [[CAST_RESULT25:%.*]] = phi ptr [ [[ADD_PTR23]], %[[CAST_NOTNULL22]] ], [ null, %[[CAST_END18]] ] +// CHECK-NOAUTH-NEXT: [[CALL26:%.*]] = call ptr @_ZN5test11bEPNS_1BE(ptr [[CAST_RESULT25]]) +// CHECK-NOAUTH-NEXT: [[TMP5:%.*]] = icmp eq ptr [[EINSTANCE]], null +// CHECK-NOAUTH-NEXT: br i1 [[TMP5]], label %[[CAST_END29:.*]], label %[[CAST_NOTNULL27:.*]] +// CHECK-NOAUTH: [[CAST_NOTNULL27]]: +// CHECK-NOAUTH-NEXT: [[ADD_PTR28:%.*]] = getelementptr inbounds i8, ptr [[EINSTANCE]], i64 8 +// CHECK-NOAUTH-NEXT: br label %[[CAST_END29]] +// CHECK-NOAUTH: [[CAST_END29]]: +// CHECK-NOAUTH-NEXT: [[CAST_RESULT30:%.*]] = phi ptr [ [[ADD_PTR28]], %[[CAST_NOTNULL27]] ], [ null, %[[CAST_END24]] ] +// CHECK-NOAUTH-NEXT: [[CALL31:%.*]] = call ptr @_ZN5test11bEPNS_1BE(ptr [[CAST_RESULT30]]) +// CHECK-NOAUTH-NEXT: [[CALL32:%.*]] = call ptr @_ZN5test16b_as_AEPNS_1BE(ptr [[BINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[CALL33:%.*]] = call ptr @_ZN5test11cEPNS_1CE(ptr [[CINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[CALL34:%.*]] = call ptr @_ZN5test16c_as_ZEPNS_1CE(ptr [[CINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[CALL35:%.*]] = call ptr @_ZN5test16c_as_BEPNS_1CE(ptr [[CINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[CALL36:%.*]] = call ptr @_ZN5test11dEPNS_1DE(ptr [[DINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[CALL37:%.*]] = call ptr @_ZN5test11dEPNS_1DE(ptr [[EINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[CALL38:%.*]] = call ptr @_ZN5test16d_as_AEPNS_1DE(ptr [[DINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[CALL39:%.*]] = call ptr @_ZN5test16d_as_AEPNS_1DE(ptr [[EINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[CALL40:%.*]] = call ptr @_ZN5test11eEPNS_1EE(ptr [[EINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[CALL41:%.*]] = call ptr @_ZN5test16e_as_BEPNS_1EE(ptr [[EINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[CALL42:%.*]] = call ptr @_ZN5test16e_as_DEPNS_1EE(ptr [[EINSTANCE]]) +// CHECK-NOAUTH-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %"struct.test1::E"], ptr [[EARRAY]], i64 0, i64 0 +// CHECK-NOAUTH-NEXT: [[VTABLE43:%.*]] = load ptr, ptr [[ARRAYDECAY]], align 16, !tbaa [[TBAA9]] +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[EARRAY]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[EINSTANCE]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[DINSTANCE]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[CINSTANCE]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[BINSTANCE]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[AINSTANCE]]) #[[ATTR5]] +// CHECK-NOAUTH-NEXT: ret void +// +// CHECK-TYPEAUTH-LABEL: define void @_ZN5test14testEv( +// CHECK-TYPEAUTH-SAME: ) #[[ATTR0]] { +// CHECK-TYPEAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-TYPEAUTH-NEXT: [[AINSTANCE:%.*]] = alloca %"struct.test1::A", align 8 +// CHECK-TYPEAUTH-NEXT: 
[[BINSTANCE:%.*]] = alloca %"struct.test1::B", align 8 +// CHECK-TYPEAUTH-NEXT: [[CINSTANCE:%.*]] = alloca %"struct.test1::C", align 8 +// CHECK-TYPEAUTH-NEXT: [[DINSTANCE:%.*]] = alloca %"struct.test1::D", align 8 +// CHECK-TYPEAUTH-NEXT: [[EINSTANCE:%.*]] = alloca %"struct.test1::E", align 8 +// CHECK-TYPEAUTH-NEXT: [[EARRAY:%.*]] = alloca [1 x %"struct.test1::E"], align 8 +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[AINSTANCE]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: [[CALL:%.*]] = call ptr @_ZN5test11AC1Ev(ptr nonnull align 8 dereferenceable(8) [[AINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[BINSTANCE]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: [[CALL1:%.*]] = call ptr @_ZN5test11BC1Ev(ptr nonnull align 8 dereferenceable(8) [[BINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[CINSTANCE]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: [[CALL2:%.*]] = call ptr @_ZN5test11CC1Ev(ptr nonnull align 8 dereferenceable(16) [[CINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[DINSTANCE]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: [[CALL3:%.*]] = call ptr @_ZN5test11DC1Ev(ptr nonnull align 8 dereferenceable(8) [[DINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[EINSTANCE]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: [[CALL4:%.*]] = call ptr @_ZN5test11EC1Ev(ptr nonnull align 8 dereferenceable(16) [[EINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[EARRAY]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[EARRAY]], i8 0, i64 16, i1 false) +// CHECK-TYPEAUTH-NEXT: [[CALL5:%.*]] = call ptr @_ZN5test11EC1Ev(ptr nonnull align 8 dereferenceable(16) [[EARRAY]]) +// CHECK-TYPEAUTH-NEXT: [[CALL6:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[AINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[CALL7:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[BINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[TMP0:%.*]] = icmp eq ptr [[CINSTANCE]], null +// CHECK-TYPEAUTH-NEXT: br i1 [[TMP0]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-TYPEAUTH: [[CAST_NOTNULL]]: +// CHECK-TYPEAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[CINSTANCE]], i64 8 +// CHECK-TYPEAUTH-NEXT: br label %[[CAST_END]] +// CHECK-TYPEAUTH: [[CAST_END]]: +// CHECK-TYPEAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-TYPEAUTH-NEXT: [[CALL8:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT]]) +// CHECK-TYPEAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[DINSTANCE]], null +// CHECK-TYPEAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END11:.*]], label %[[CAST_NOTNULL9:.*]] +// CHECK-TYPEAUTH: [[CAST_NOTNULL9]]: +// CHECK-TYPEAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[DINSTANCE]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP2]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +// CHECK-TYPEAUTH-NEXT: [[VBASE_OFFSET_PTR:%.*]] = getelementptr i8, ptr [[TMP4]], i64 -32 +// CHECK-TYPEAUTH-NEXT: [[VBASE_OFFSET:%.*]] = load i64, ptr [[VBASE_OFFSET_PTR]], align 8 +// CHECK-TYPEAUTH-NEXT: [[ADD_PTR10:%.*]] = getelementptr inbounds i8, ptr [[DINSTANCE]], i64 [[VBASE_OFFSET]] +// CHECK-TYPEAUTH-NEXT: br label %[[CAST_END11]] +// CHECK-TYPEAUTH: [[CAST_END11]]: +// CHECK-TYPEAUTH-NEXT: [[CAST_RESULT12:%.*]] = phi ptr [ [[ADD_PTR10]], %[[CAST_NOTNULL9]] ], [ null, %[[CAST_END]] ] +// 
CHECK-TYPEAUTH-NEXT: [[CALL13:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT12]]) +// CHECK-TYPEAUTH-NEXT: [[TMP5:%.*]] = icmp eq ptr [[EINSTANCE]], null +// CHECK-TYPEAUTH-NEXT: br i1 [[TMP5]], label %[[CAST_END19:.*]], label %[[CAST_NOTNULL14:.*]] +// CHECK-TYPEAUTH: [[CAST_NOTNULL14]]: +// CHECK-TYPEAUTH-NEXT: [[VTABLE15:%.*]] = load ptr, ptr [[EINSTANCE]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[VTABLE15]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP7:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP6]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +// CHECK-TYPEAUTH-NEXT: [[VBASE_OFFSET_PTR16:%.*]] = getelementptr i8, ptr [[TMP8]], i64 -32 +// CHECK-TYPEAUTH-NEXT: [[VBASE_OFFSET17:%.*]] = load i64, ptr [[VBASE_OFFSET_PTR16]], align 8 +// CHECK-TYPEAUTH-NEXT: [[ADD_PTR18:%.*]] = getelementptr inbounds i8, ptr [[EINSTANCE]], i64 [[VBASE_OFFSET17]] +// CHECK-TYPEAUTH-NEXT: br label %[[CAST_END19]] +// CHECK-TYPEAUTH: [[CAST_END19]]: +// CHECK-TYPEAUTH-NEXT: [[CAST_RESULT20:%.*]] = phi ptr [ [[ADD_PTR18]], %[[CAST_NOTNULL14]] ], [ null, %[[CAST_END11]] ] +// CHECK-TYPEAUTH-NEXT: [[CALL21:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT20]]) +// CHECK-TYPEAUTH-NEXT: [[TMP9:%.*]] = icmp eq ptr [[EINSTANCE]], null +// CHECK-TYPEAUTH-NEXT: br i1 [[TMP9]], label %[[CAST_END24:.*]], label %[[CAST_NOTNULL22:.*]] +// CHECK-TYPEAUTH: [[CAST_NOTNULL22]]: +// CHECK-TYPEAUTH-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i8, ptr [[EINSTANCE]], i64 8 +// CHECK-TYPEAUTH-NEXT: br label %[[CAST_END24]] +// CHECK-TYPEAUTH: [[CAST_END24]]: +// CHECK-TYPEAUTH-NEXT: [[CAST_RESULT25:%.*]] = phi ptr [ [[ADD_PTR23]], %[[CAST_NOTNULL22]] ], [ null, %[[CAST_END19]] ] +// CHECK-TYPEAUTH-NEXT: [[CALL26:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT25]]) +// CHECK-TYPEAUTH-NEXT: [[CALL27:%.*]] = call ptr @_ZN5test11bEPNS_1BE(ptr [[BINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[TMP10:%.*]] = icmp eq ptr [[CINSTANCE]], null +// CHECK-TYPEAUTH-NEXT: br i1 [[TMP10]], label %[[CAST_END30:.*]], label %[[CAST_NOTNULL28:.*]] +// CHECK-TYPEAUTH: [[CAST_NOTNULL28]]: +// CHECK-TYPEAUTH-NEXT: [[ADD_PTR29:%.*]] = getelementptr inbounds i8, ptr [[CINSTANCE]], i64 8 +// CHECK-TYPEAUTH-NEXT: br label %[[CAST_END30]] +// CHECK-TYPEAUTH: [[CAST_END30]]: +// CHECK-TYPEAUTH-NEXT: [[CAST_RESULT31:%.*]] = phi ptr [ [[ADD_PTR29]], %[[CAST_NOTNULL28]] ], [ null, %[[CAST_END24]] ] +// CHECK-TYPEAUTH-NEXT: [[CALL32:%.*]] = call ptr @_ZN5test11bEPNS_1BE(ptr [[CAST_RESULT31]]) +// CHECK-TYPEAUTH-NEXT: [[TMP11:%.*]] = icmp eq ptr [[EINSTANCE]], null +// CHECK-TYPEAUTH-NEXT: br i1 [[TMP11]], label %[[CAST_END35:.*]], label %[[CAST_NOTNULL33:.*]] +// CHECK-TYPEAUTH: [[CAST_NOTNULL33]]: +// CHECK-TYPEAUTH-NEXT: [[ADD_PTR34:%.*]] = getelementptr inbounds i8, ptr [[EINSTANCE]], i64 8 +// CHECK-TYPEAUTH-NEXT: br label %[[CAST_END35]] +// CHECK-TYPEAUTH: [[CAST_END35]]: +// CHECK-TYPEAUTH-NEXT: [[CAST_RESULT36:%.*]] = phi ptr [ [[ADD_PTR34]], %[[CAST_NOTNULL33]] ], [ null, %[[CAST_END30]] ] +// CHECK-TYPEAUTH-NEXT: [[CALL37:%.*]] = call ptr @_ZN5test11bEPNS_1BE(ptr [[CAST_RESULT36]]) +// CHECK-TYPEAUTH-NEXT: [[CALL38:%.*]] = call ptr @_ZN5test16b_as_AEPNS_1BE(ptr [[BINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[CALL39:%.*]] = call ptr @_ZN5test11cEPNS_1CE(ptr [[CINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[CALL40:%.*]] = call ptr @_ZN5test16c_as_ZEPNS_1CE(ptr [[CINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[CALL41:%.*]] = call ptr @_ZN5test16c_as_BEPNS_1CE(ptr [[CINSTANCE]]) +// 
CHECK-TYPEAUTH-NEXT: [[CALL42:%.*]] = call ptr @_ZN5test11dEPNS_1DE(ptr [[DINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[CALL43:%.*]] = call ptr @_ZN5test11dEPNS_1DE(ptr [[EINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[CALL44:%.*]] = call ptr @_ZN5test16d_as_AEPNS_1DE(ptr [[DINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[CALL45:%.*]] = call ptr @_ZN5test16d_as_AEPNS_1DE(ptr [[EINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[CALL46:%.*]] = call ptr @_ZN5test11eEPNS_1EE(ptr [[EINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[CALL47:%.*]] = call ptr @_ZN5test16e_as_BEPNS_1EE(ptr [[EINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[CALL48:%.*]] = call ptr @_ZN5test16e_as_DEPNS_1EE(ptr [[EINSTANCE]]) +// CHECK-TYPEAUTH-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %"struct.test1::E"], ptr [[EARRAY]], i64 0, i64 0 +// CHECK-TYPEAUTH-NEXT: [[VTABLE49:%.*]] = load ptr, ptr [[ARRAYDECAY]], align 8, !tbaa [[TBAA9]] +// CHECK-TYPEAUTH-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[VTABLE49]] to i64 +// CHECK-TYPEAUTH-NEXT: [[TMP13:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP12]], i32 2, i64 48388) +// CHECK-TYPEAUTH-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr +// CHECK-TYPEAUTH-NEXT: [[TMP15:%.*]] = load volatile i8, ptr [[TMP14]], align 8 +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[EARRAY]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[EINSTANCE]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[DINSTANCE]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[CINSTANCE]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[BINSTANCE]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[AINSTANCE]]) #[[ATTR7]] +// CHECK-TYPEAUTH-NEXT: ret void +// +// CHECK-ADDRESSAUTH-LABEL: define void @_ZN5test14testEv( +// CHECK-ADDRESSAUTH-SAME: ) #[[ATTR0]] { +// CHECK-ADDRESSAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-ADDRESSAUTH-NEXT: [[AINSTANCE:%.*]] = alloca %"struct.test1::A", align 8 +// CHECK-ADDRESSAUTH-NEXT: [[BINSTANCE:%.*]] = alloca %"struct.test1::B", align 8 +// CHECK-ADDRESSAUTH-NEXT: [[CINSTANCE:%.*]] = alloca %"struct.test1::C", align 8 +// CHECK-ADDRESSAUTH-NEXT: [[DINSTANCE:%.*]] = alloca %"struct.test1::D", align 8 +// CHECK-ADDRESSAUTH-NEXT: [[EINSTANCE:%.*]] = alloca %"struct.test1::E", align 8 +// CHECK-ADDRESSAUTH-NEXT: [[EARRAY:%.*]] = alloca [1 x %"struct.test1::E"], align 8 +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[AINSTANCE]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: [[CALL:%.*]] = call ptr @_ZN5test11AC1Ev(ptr nonnull align 8 dereferenceable(8) [[AINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[BINSTANCE]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: [[CALL1:%.*]] = call ptr @_ZN5test11BC1Ev(ptr nonnull align 8 dereferenceable(8) [[BINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[CINSTANCE]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: [[CALL2:%.*]] = call ptr @_ZN5test11CC1Ev(ptr nonnull align 8 dereferenceable(16) [[CINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[DINSTANCE]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: [[CALL3:%.*]] = call ptr @_ZN5test11DC1Ev(ptr nonnull align 8 dereferenceable(8) [[DINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[EINSTANCE]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: [[CALL4:%.*]] = call ptr @_ZN5test11EC1Ev(ptr nonnull align 8 dereferenceable(16) [[EINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr 
[[EARRAY]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[EARRAY]], i8 0, i64 16, i1 false) +// CHECK-ADDRESSAUTH-NEXT: [[CALL5:%.*]] = call ptr @_ZN5test11EC1Ev(ptr nonnull align 8 dereferenceable(16) [[EARRAY]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL6:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[AINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL7:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[BINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP0:%.*]] = icmp eq ptr [[CINSTANCE]], null +// CHECK-ADDRESSAUTH-NEXT: br i1 [[TMP0]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-ADDRESSAUTH: [[CAST_NOTNULL]]: +// CHECK-ADDRESSAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[CINSTANCE]], i64 8 +// CHECK-ADDRESSAUTH-NEXT: br label %[[CAST_END]] +// CHECK-ADDRESSAUTH: [[CAST_END]]: +// CHECK-ADDRESSAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-ADDRESSAUTH-NEXT: [[CALL8:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[DINSTANCE]], null +// CHECK-ADDRESSAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END11:.*]], label %[[CAST_NOTNULL9:.*]] +// CHECK-ADDRESSAUTH: [[CAST_NOTNULL9]]: +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[DINSTANCE]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[DINSTANCE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP4:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP3]], i32 2, i64 [[TMP2]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[VBASE_OFFSET_PTR:%.*]] = getelementptr i8, ptr [[TMP5]], i64 -32 +// CHECK-ADDRESSAUTH-NEXT: [[VBASE_OFFSET:%.*]] = load i64, ptr [[VBASE_OFFSET_PTR]], align 8 +// CHECK-ADDRESSAUTH-NEXT: [[ADD_PTR10:%.*]] = getelementptr inbounds i8, ptr [[DINSTANCE]], i64 [[VBASE_OFFSET]] +// CHECK-ADDRESSAUTH-NEXT: br label %[[CAST_END11]] +// CHECK-ADDRESSAUTH: [[CAST_END11]]: +// CHECK-ADDRESSAUTH-NEXT: [[CAST_RESULT12:%.*]] = phi ptr [ [[ADD_PTR10]], %[[CAST_NOTNULL9]] ], [ null, %[[CAST_END]] ] +// CHECK-ADDRESSAUTH-NEXT: [[CALL13:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT12]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP6:%.*]] = icmp eq ptr [[EINSTANCE]], null +// CHECK-ADDRESSAUTH-NEXT: br i1 [[TMP6]], label %[[CAST_END19:.*]], label %[[CAST_NOTNULL14:.*]] +// CHECK-ADDRESSAUTH: [[CAST_NOTNULL14]]: +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE15:%.*]] = load ptr, ptr [[EINSTANCE]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[EINSTANCE]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[VTABLE15]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP9:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP8]], i32 2, i64 [[TMP7]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[VBASE_OFFSET_PTR16:%.*]] = getelementptr i8, ptr [[TMP10]], i64 -32 +// CHECK-ADDRESSAUTH-NEXT: [[VBASE_OFFSET17:%.*]] = load i64, ptr [[VBASE_OFFSET_PTR16]], align 8 +// CHECK-ADDRESSAUTH-NEXT: [[ADD_PTR18:%.*]] = getelementptr inbounds i8, ptr [[EINSTANCE]], i64 [[VBASE_OFFSET17]] +// CHECK-ADDRESSAUTH-NEXT: br label %[[CAST_END19]] +// CHECK-ADDRESSAUTH: [[CAST_END19]]: +// CHECK-ADDRESSAUTH-NEXT: [[CAST_RESULT20:%.*]] = phi ptr [ [[ADD_PTR18]], %[[CAST_NOTNULL14]] ], [ null, %[[CAST_END11]] ] +// CHECK-ADDRESSAUTH-NEXT: [[CALL21:%.*]] = call ptr 
@_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT20]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP11:%.*]] = icmp eq ptr [[EINSTANCE]], null +// CHECK-ADDRESSAUTH-NEXT: br i1 [[TMP11]], label %[[CAST_END24:.*]], label %[[CAST_NOTNULL22:.*]] +// CHECK-ADDRESSAUTH: [[CAST_NOTNULL22]]: +// CHECK-ADDRESSAUTH-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i8, ptr [[EINSTANCE]], i64 8 +// CHECK-ADDRESSAUTH-NEXT: br label %[[CAST_END24]] +// CHECK-ADDRESSAUTH: [[CAST_END24]]: +// CHECK-ADDRESSAUTH-NEXT: [[CAST_RESULT25:%.*]] = phi ptr [ [[ADD_PTR23]], %[[CAST_NOTNULL22]] ], [ null, %[[CAST_END19]] ] +// CHECK-ADDRESSAUTH-NEXT: [[CALL26:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT25]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL27:%.*]] = call ptr @_ZN5test11bEPNS_1BE(ptr [[BINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP12:%.*]] = icmp eq ptr [[CINSTANCE]], null +// CHECK-ADDRESSAUTH-NEXT: br i1 [[TMP12]], label %[[CAST_END30:.*]], label %[[CAST_NOTNULL28:.*]] +// CHECK-ADDRESSAUTH: [[CAST_NOTNULL28]]: +// CHECK-ADDRESSAUTH-NEXT: [[ADD_PTR29:%.*]] = getelementptr inbounds i8, ptr [[CINSTANCE]], i64 8 +// CHECK-ADDRESSAUTH-NEXT: br label %[[CAST_END30]] +// CHECK-ADDRESSAUTH: [[CAST_END30]]: +// CHECK-ADDRESSAUTH-NEXT: [[CAST_RESULT31:%.*]] = phi ptr [ [[ADD_PTR29]], %[[CAST_NOTNULL28]] ], [ null, %[[CAST_END24]] ] +// CHECK-ADDRESSAUTH-NEXT: [[CALL32:%.*]] = call ptr @_ZN5test11bEPNS_1BE(ptr [[CAST_RESULT31]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP13:%.*]] = icmp eq ptr [[EINSTANCE]], null +// CHECK-ADDRESSAUTH-NEXT: br i1 [[TMP13]], label %[[CAST_END35:.*]], label %[[CAST_NOTNULL33:.*]] +// CHECK-ADDRESSAUTH: [[CAST_NOTNULL33]]: +// CHECK-ADDRESSAUTH-NEXT: [[ADD_PTR34:%.*]] = getelementptr inbounds i8, ptr [[EINSTANCE]], i64 8 +// CHECK-ADDRESSAUTH-NEXT: br label %[[CAST_END35]] +// CHECK-ADDRESSAUTH: [[CAST_END35]]: +// CHECK-ADDRESSAUTH-NEXT: [[CAST_RESULT36:%.*]] = phi ptr [ [[ADD_PTR34]], %[[CAST_NOTNULL33]] ], [ null, %[[CAST_END30]] ] +// CHECK-ADDRESSAUTH-NEXT: [[CALL37:%.*]] = call ptr @_ZN5test11bEPNS_1BE(ptr [[CAST_RESULT36]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL38:%.*]] = call ptr @_ZN5test16b_as_AEPNS_1BE(ptr [[BINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL39:%.*]] = call ptr @_ZN5test11cEPNS_1CE(ptr [[CINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL40:%.*]] = call ptr @_ZN5test16c_as_ZEPNS_1CE(ptr [[CINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL41:%.*]] = call ptr @_ZN5test16c_as_BEPNS_1CE(ptr [[CINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL42:%.*]] = call ptr @_ZN5test11dEPNS_1DE(ptr [[DINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL43:%.*]] = call ptr @_ZN5test11dEPNS_1DE(ptr [[EINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL44:%.*]] = call ptr @_ZN5test16d_as_AEPNS_1DE(ptr [[DINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL45:%.*]] = call ptr @_ZN5test16d_as_AEPNS_1DE(ptr [[EINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL46:%.*]] = call ptr @_ZN5test11eEPNS_1EE(ptr [[EINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL47:%.*]] = call ptr @_ZN5test16e_as_BEPNS_1EE(ptr [[EINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[CALL48:%.*]] = call ptr @_ZN5test16e_as_DEPNS_1EE(ptr [[EINSTANCE]]) +// CHECK-ADDRESSAUTH-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %"struct.test1::E"], ptr [[EARRAY]], i64 0, i64 0 +// CHECK-ADDRESSAUTH-NEXT: [[VTABLE49:%.*]] = load ptr, ptr [[ARRAYDECAY]], align 8, !tbaa [[TBAA9]] +// CHECK-ADDRESSAUTH-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[ARRAYDECAY]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[VTABLE49]] to i64 +// CHECK-ADDRESSAUTH-NEXT: [[TMP16:%.*]] = call i64 
@llvm.ptrauth.auth(i64 [[TMP15]], i32 2, i64 [[TMP14]]) +// CHECK-ADDRESSAUTH-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +// CHECK-ADDRESSAUTH-NEXT: [[TMP18:%.*]] = load volatile i8, ptr [[TMP17]], align 8 +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[EARRAY]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[EINSTANCE]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[DINSTANCE]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[CINSTANCE]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[BINSTANCE]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[AINSTANCE]]) #[[ATTR7]] +// CHECK-ADDRESSAUTH-NEXT: ret void +// +// CHECK-BOTHAUTH-LABEL: define void @_ZN5test14testEv( +// CHECK-BOTHAUTH-SAME: ) #[[ATTR0]] { +// CHECK-BOTHAUTH-NEXT: [[ENTRY:.*]]: +// CHECK-BOTHAUTH-NEXT: [[AINSTANCE:%.*]] = alloca %"struct.test1::A", align 8 +// CHECK-BOTHAUTH-NEXT: [[BINSTANCE:%.*]] = alloca %"struct.test1::B", align 8 +// CHECK-BOTHAUTH-NEXT: [[CINSTANCE:%.*]] = alloca %"struct.test1::C", align 8 +// CHECK-BOTHAUTH-NEXT: [[DINSTANCE:%.*]] = alloca %"struct.test1::D", align 8 +// CHECK-BOTHAUTH-NEXT: [[EINSTANCE:%.*]] = alloca %"struct.test1::E", align 8 +// CHECK-BOTHAUTH-NEXT: [[EARRAY:%.*]] = alloca [1 x %"struct.test1::E"], align 8 +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[AINSTANCE]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: [[CALL:%.*]] = call ptr @_ZN5test11AC1Ev(ptr nonnull align 8 dereferenceable(8) [[AINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[BINSTANCE]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: [[CALL1:%.*]] = call ptr @_ZN5test11BC1Ev(ptr nonnull align 8 dereferenceable(8) [[BINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[CINSTANCE]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: [[CALL2:%.*]] = call ptr @_ZN5test11CC1Ev(ptr nonnull align 8 dereferenceable(16) [[CINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[DINSTANCE]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: [[CALL3:%.*]] = call ptr @_ZN5test11DC1Ev(ptr nonnull align 8 dereferenceable(8) [[DINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[EINSTANCE]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: [[CALL4:%.*]] = call ptr @_ZN5test11EC1Ev(ptr nonnull align 8 dereferenceable(16) [[EINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.start.p0(ptr [[EARRAY]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[EARRAY]], i8 0, i64 16, i1 false) +// CHECK-BOTHAUTH-NEXT: [[CALL5:%.*]] = call ptr @_ZN5test11EC1Ev(ptr nonnull align 8 dereferenceable(16) [[EARRAY]]) +// CHECK-BOTHAUTH-NEXT: [[CALL6:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[AINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[CALL7:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[BINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[TMP0:%.*]] = icmp eq ptr [[CINSTANCE]], null +// CHECK-BOTHAUTH-NEXT: br i1 [[TMP0]], label %[[CAST_END:.*]], label %[[CAST_NOTNULL:.*]] +// CHECK-BOTHAUTH: [[CAST_NOTNULL]]: +// CHECK-BOTHAUTH-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[CINSTANCE]], i64 8 +// CHECK-BOTHAUTH-NEXT: br label %[[CAST_END]] +// CHECK-BOTHAUTH: [[CAST_END]]: +// CHECK-BOTHAUTH-NEXT: [[CAST_RESULT:%.*]] = phi ptr [ [[ADD_PTR]], %[[CAST_NOTNULL]] ], [ null, %[[ENTRY]] ] +// CHECK-BOTHAUTH-NEXT: [[CALL8:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT]]) +// 
CHECK-BOTHAUTH-NEXT: [[TMP1:%.*]] = icmp eq ptr [[DINSTANCE]], null +// CHECK-BOTHAUTH-NEXT: br i1 [[TMP1]], label %[[CAST_END11:.*]], label %[[CAST_NOTNULL9:.*]] +// CHECK-BOTHAUTH: [[CAST_NOTNULL9]]: +// CHECK-BOTHAUTH-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[DINSTANCE]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[DINSTANCE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP3:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP2]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[VTABLE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP5:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP4]], i32 2, i64 [[TMP3]]) +// CHECK-BOTHAUTH-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +// CHECK-BOTHAUTH-NEXT: [[VBASE_OFFSET_PTR:%.*]] = getelementptr i8, ptr [[TMP6]], i64 -32 +// CHECK-BOTHAUTH-NEXT: [[VBASE_OFFSET:%.*]] = load i64, ptr [[VBASE_OFFSET_PTR]], align 8 +// CHECK-BOTHAUTH-NEXT: [[ADD_PTR10:%.*]] = getelementptr inbounds i8, ptr [[DINSTANCE]], i64 [[VBASE_OFFSET]] +// CHECK-BOTHAUTH-NEXT: br label %[[CAST_END11]] +// CHECK-BOTHAUTH: [[CAST_END11]]: +// CHECK-BOTHAUTH-NEXT: [[CAST_RESULT12:%.*]] = phi ptr [ [[ADD_PTR10]], %[[CAST_NOTNULL9]] ], [ null, %[[CAST_END]] ] +// CHECK-BOTHAUTH-NEXT: [[CALL13:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT12]]) +// CHECK-BOTHAUTH-NEXT: [[TMP7:%.*]] = icmp eq ptr [[EINSTANCE]], null +// CHECK-BOTHAUTH-NEXT: br i1 [[TMP7]], label %[[CAST_END19:.*]], label %[[CAST_NOTNULL14:.*]] +// CHECK-BOTHAUTH: [[CAST_NOTNULL14]]: +// CHECK-BOTHAUTH-NEXT: [[VTABLE15:%.*]] = load ptr, ptr [[EINSTANCE]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[EINSTANCE]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP9:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP8]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[VTABLE15]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP11:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP10]], i32 2, i64 [[TMP9]]) +// CHECK-BOTHAUTH-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +// CHECK-BOTHAUTH-NEXT: [[VBASE_OFFSET_PTR16:%.*]] = getelementptr i8, ptr [[TMP12]], i64 -32 +// CHECK-BOTHAUTH-NEXT: [[VBASE_OFFSET17:%.*]] = load i64, ptr [[VBASE_OFFSET_PTR16]], align 8 +// CHECK-BOTHAUTH-NEXT: [[ADD_PTR18:%.*]] = getelementptr inbounds i8, ptr [[EINSTANCE]], i64 [[VBASE_OFFSET17]] +// CHECK-BOTHAUTH-NEXT: br label %[[CAST_END19]] +// CHECK-BOTHAUTH: [[CAST_END19]]: +// CHECK-BOTHAUTH-NEXT: [[CAST_RESULT20:%.*]] = phi ptr [ [[ADD_PTR18]], %[[CAST_NOTNULL14]] ], [ null, %[[CAST_END11]] ] +// CHECK-BOTHAUTH-NEXT: [[CALL21:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT20]]) +// CHECK-BOTHAUTH-NEXT: [[TMP13:%.*]] = icmp eq ptr [[EINSTANCE]], null +// CHECK-BOTHAUTH-NEXT: br i1 [[TMP13]], label %[[CAST_END24:.*]], label %[[CAST_NOTNULL22:.*]] +// CHECK-BOTHAUTH: [[CAST_NOTNULL22]]: +// CHECK-BOTHAUTH-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i8, ptr [[EINSTANCE]], i64 8 +// CHECK-BOTHAUTH-NEXT: br label %[[CAST_END24]] +// CHECK-BOTHAUTH: [[CAST_END24]]: +// CHECK-BOTHAUTH-NEXT: [[CAST_RESULT25:%.*]] = phi ptr [ [[ADD_PTR23]], %[[CAST_NOTNULL22]] ], [ null, %[[CAST_END19]] ] +// CHECK-BOTHAUTH-NEXT: [[CALL26:%.*]] = call ptr @_ZN5test11aEPNS_1AE(ptr [[CAST_RESULT25]]) +// CHECK-BOTHAUTH-NEXT: [[CALL27:%.*]] = call ptr @_ZN5test11bEPNS_1BE(ptr [[BINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[TMP14:%.*]] = icmp eq ptr [[CINSTANCE]], null +// CHECK-BOTHAUTH-NEXT: br i1 [[TMP14]], label %[[CAST_END30:.*]], label %[[CAST_NOTNULL28:.*]] +// CHECK-BOTHAUTH: [[CAST_NOTNULL28]]: +// 
CHECK-BOTHAUTH-NEXT: [[ADD_PTR29:%.*]] = getelementptr inbounds i8, ptr [[CINSTANCE]], i64 8 +// CHECK-BOTHAUTH-NEXT: br label %[[CAST_END30]] +// CHECK-BOTHAUTH: [[CAST_END30]]: +// CHECK-BOTHAUTH-NEXT: [[CAST_RESULT31:%.*]] = phi ptr [ [[ADD_PTR29]], %[[CAST_NOTNULL28]] ], [ null, %[[CAST_END24]] ] +// CHECK-BOTHAUTH-NEXT: [[CALL32:%.*]] = call ptr @_ZN5test11bEPNS_1BE(ptr [[CAST_RESULT31]]) +// CHECK-BOTHAUTH-NEXT: [[TMP15:%.*]] = icmp eq ptr [[EINSTANCE]], null +// CHECK-BOTHAUTH-NEXT: br i1 [[TMP15]], label %[[CAST_END35:.*]], label %[[CAST_NOTNULL33:.*]] +// CHECK-BOTHAUTH: [[CAST_NOTNULL33]]: +// CHECK-BOTHAUTH-NEXT: [[ADD_PTR34:%.*]] = getelementptr inbounds i8, ptr [[EINSTANCE]], i64 8 +// CHECK-BOTHAUTH-NEXT: br label %[[CAST_END35]] +// CHECK-BOTHAUTH: [[CAST_END35]]: +// CHECK-BOTHAUTH-NEXT: [[CAST_RESULT36:%.*]] = phi ptr [ [[ADD_PTR34]], %[[CAST_NOTNULL33]] ], [ null, %[[CAST_END30]] ] +// CHECK-BOTHAUTH-NEXT: [[CALL37:%.*]] = call ptr @_ZN5test11bEPNS_1BE(ptr [[CAST_RESULT36]]) +// CHECK-BOTHAUTH-NEXT: [[CALL38:%.*]] = call ptr @_ZN5test16b_as_AEPNS_1BE(ptr [[BINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[CALL39:%.*]] = call ptr @_ZN5test11cEPNS_1CE(ptr [[CINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[CALL40:%.*]] = call ptr @_ZN5test16c_as_ZEPNS_1CE(ptr [[CINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[CALL41:%.*]] = call ptr @_ZN5test16c_as_BEPNS_1CE(ptr [[CINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[CALL42:%.*]] = call ptr @_ZN5test11dEPNS_1DE(ptr [[DINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[CALL43:%.*]] = call ptr @_ZN5test11dEPNS_1DE(ptr [[EINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[CALL44:%.*]] = call ptr @_ZN5test16d_as_AEPNS_1DE(ptr [[DINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[CALL45:%.*]] = call ptr @_ZN5test16d_as_AEPNS_1DE(ptr [[EINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[CALL46:%.*]] = call ptr @_ZN5test11eEPNS_1EE(ptr [[EINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[CALL47:%.*]] = call ptr @_ZN5test16e_as_BEPNS_1EE(ptr [[EINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[CALL48:%.*]] = call ptr @_ZN5test16e_as_DEPNS_1EE(ptr [[EINSTANCE]]) +// CHECK-BOTHAUTH-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %"struct.test1::E"], ptr [[EARRAY]], i64 0, i64 0 +// CHECK-BOTHAUTH-NEXT: [[VTABLE49:%.*]] = load ptr, ptr [[ARRAYDECAY]], align 8, !tbaa [[TBAA9]] +// CHECK-BOTHAUTH-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[ARRAYDECAY]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP17:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[TMP16]], i64 48388) +// CHECK-BOTHAUTH-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[VTABLE49]] to i64 +// CHECK-BOTHAUTH-NEXT: [[TMP19:%.*]] = call i64 @llvm.ptrauth.auth(i64 [[TMP18]], i32 2, i64 [[TMP17]]) +// CHECK-BOTHAUTH-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr +// CHECK-BOTHAUTH-NEXT: [[TMP21:%.*]] = load volatile i8, ptr [[TMP20]], align 8 +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[EARRAY]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[EINSTANCE]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[DINSTANCE]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[CINSTANCE]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[BINSTANCE]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: call void @llvm.lifetime.end.p0(ptr [[AINSTANCE]]) #[[ATTR7]] +// CHECK-BOTHAUTH-NEXT: ret void +// void test() { A aInstance; B bInstance; @@ -368,3 +1350,68 @@ void test() { (void)__builtin_get_vtable_pointer(eArray); } } // namespace test1 +//. 
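[Editorial note: the checks above are emitted by update_cc_test_checks.py, so the [[TMPn]]/[[CALLn]] numbering is mechanical. Condensed to its essentials, the BOTHAUTH sequence the checks verify has the following IR shape; this is a hand-written sketch, not tool output, and the function/value names are illustrative only.]

define ptr @vptr_auth_sketch(ptr %obj) {
entry:
  ; raw (signed) vtable pointer loaded from the object
  %vtable = load ptr, ptr %obj, align 8
  ; address discriminator: the storage address of the vptr itself
  %addr = ptrtoint ptr %obj to i64
  ; blended with the constant type discriminator 48388 used throughout this test
  %disc = call i64 @llvm.ptrauth.blend(i64 %addr, i64 48388)
  ; authenticate with key 2 (DA) and the blended discriminator
  %raw = ptrtoint ptr %vtable to i64
  %auth = call i64 @llvm.ptrauth.auth(i64 %raw, i32 2, i64 %disc)
  %vptr = inttoptr i64 %auth to ptr
  ret ptr %vptr
}

declare i64 @llvm.ptrauth.blend(i64, i64)
declare i64 @llvm.ptrauth.auth(i64, i32, i64)

[As the per-prefix checks above show, TYPEAUTH passes the constant 48388 directly as the discriminator and ADDRESSAUTH passes the unblended address; the volatile i8 load that follows in the generated code dereferences the authenticated pointer, apparently so that a failed authentication traps rather than propagating a poisoned value.]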
+// CHECK-NOAUTH: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK-NOAUTH: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK-NOAUTH: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-NOAUTH: [[META7]] = !{!"p1 _ZTSN5test11AE", [[META8:![0-9]+]], i64 0} +// CHECK-NOAUTH: [[META8]] = !{!"any pointer", [[META4]], i64 0} +// CHECK-NOAUTH: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// CHECK-NOAUTH: [[META10]] = !{!"vtable pointer", [[META5]], i64 0} +// CHECK-NOAUTH: [[TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0} +// CHECK-NOAUTH: [[META12]] = !{!"p1 _ZTSN5test11BE", [[META8]], i64 0} +// CHECK-NOAUTH: [[TBAA13]] = !{[[META14:![0-9]+]], [[META14]], i64 0} +// CHECK-NOAUTH: [[META14]] = !{!"p1 _ZTSN5test11CE", [[META8]], i64 0} +// CHECK-NOAUTH: [[TBAA15]] = !{[[META16:![0-9]+]], [[META16]], i64 0} +// CHECK-NOAUTH: [[META16]] = !{!"p1 _ZTSN5test11DE", [[META8]], i64 0} +// CHECK-NOAUTH: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// CHECK-NOAUTH: [[META18]] = !{!"p1 _ZTSN5test11EE", [[META8]], i64 0} +//. +// CHECK-TYPEAUTH: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK-TYPEAUTH: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK-TYPEAUTH: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-TYPEAUTH: [[META7]] = !{!"p1 _ZTSN5test11AE", [[META8:![0-9]+]], i64 0} +// CHECK-TYPEAUTH: [[META8]] = !{!"any pointer", [[META4]], i64 0} +// CHECK-TYPEAUTH: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// CHECK-TYPEAUTH: [[META10]] = !{!"vtable pointer", [[META5]], i64 0} +// CHECK-TYPEAUTH: [[TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0} +// CHECK-TYPEAUTH: [[META12]] = !{!"p1 _ZTSN5test11BE", [[META8]], i64 0} +// CHECK-TYPEAUTH: [[TBAA13]] = !{[[META14:![0-9]+]], [[META14]], i64 0} +// CHECK-TYPEAUTH: [[META14]] = !{!"p1 _ZTSN5test11CE", [[META8]], i64 0} +// CHECK-TYPEAUTH: [[TBAA15]] = !{[[META16:![0-9]+]], [[META16]], i64 0} +// CHECK-TYPEAUTH: [[META16]] = !{!"p1 _ZTSN5test11DE", [[META8]], i64 0} +// CHECK-TYPEAUTH: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// CHECK-TYPEAUTH: [[META18]] = !{!"p1 _ZTSN5test11EE", [[META8]], i64 0} +//. +// CHECK-ADDRESSAUTH: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK-ADDRESSAUTH: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK-ADDRESSAUTH: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-ADDRESSAUTH: [[META7]] = !{!"p1 _ZTSN5test11AE", [[META8:![0-9]+]], i64 0} +// CHECK-ADDRESSAUTH: [[META8]] = !{!"any pointer", [[META4]], i64 0} +// CHECK-ADDRESSAUTH: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// CHECK-ADDRESSAUTH: [[META10]] = !{!"vtable pointer", [[META5]], i64 0} +// CHECK-ADDRESSAUTH: [[TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0} +// CHECK-ADDRESSAUTH: [[META12]] = !{!"p1 _ZTSN5test11BE", [[META8]], i64 0} +// CHECK-ADDRESSAUTH: [[TBAA13]] = !{[[META14:![0-9]+]], [[META14]], i64 0} +// CHECK-ADDRESSAUTH: [[META14]] = !{!"p1 _ZTSN5test11CE", [[META8]], i64 0} +// CHECK-ADDRESSAUTH: [[TBAA15]] = !{[[META16:![0-9]+]], [[META16]], i64 0} +// CHECK-ADDRESSAUTH: [[META16]] = !{!"p1 _ZTSN5test11DE", [[META8]], i64 0} +// CHECK-ADDRESSAUTH: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// CHECK-ADDRESSAUTH: [[META18]] = !{!"p1 _ZTSN5test11EE", [[META8]], i64 0} +//. 
+// CHECK-BOTHAUTH: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK-BOTHAUTH: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK-BOTHAUTH: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-BOTHAUTH: [[META7]] = !{!"p1 _ZTSN5test11AE", [[META8:![0-9]+]], i64 0} +// CHECK-BOTHAUTH: [[META8]] = !{!"any pointer", [[META4]], i64 0} +// CHECK-BOTHAUTH: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// CHECK-BOTHAUTH: [[META10]] = !{!"vtable pointer", [[META5]], i64 0} +// CHECK-BOTHAUTH: [[TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0} +// CHECK-BOTHAUTH: [[META12]] = !{!"p1 _ZTSN5test11BE", [[META8]], i64 0} +// CHECK-BOTHAUTH: [[TBAA13]] = !{[[META14:![0-9]+]], [[META14]], i64 0} +// CHECK-BOTHAUTH: [[META14]] = !{!"p1 _ZTSN5test11CE", [[META8]], i64 0} +// CHECK-BOTHAUTH: [[TBAA15]] = !{[[META16:![0-9]+]], [[META16]], i64 0} +// CHECK-BOTHAUTH: [[META16]] = !{!"p1 _ZTSN5test11DE", [[META8]], i64 0} +// CHECK-BOTHAUTH: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} +// CHECK-BOTHAUTH: [[META18]] = !{!"p1 _ZTSN5test11EE", [[META8]], i64 0} +//. diff --git a/clang/test/CodeGenCXX/cfi-mfcall-nomerge.cpp b/clang/test/CodeGenCXX/cfi-mfcall-nomerge.cpp index d4b4f3030d117..5c3bd17ab909c 100644 --- a/clang/test/CodeGenCXX/cfi-mfcall-nomerge.cpp +++ b/clang/test/CodeGenCXX/cfi-mfcall-nomerge.cpp @@ -29,43 +29,43 @@ void f(S *s, void (S::*p)()) { // NO-MERGE-NEXT: [[MEMPTR_ISVIRTUAL_NOT:%.*]] = icmp eq i64 [[TMP1]], 0 // NO-MERGE-NEXT: br i1 [[MEMPTR_ISVIRTUAL_NOT]], label %[[MEMPTR_NONVIRTUAL:.*]], label %[[MEMPTR_VIRTUAL:.*]] // NO-MERGE: [[MEMPTR_VIRTUAL]]: -// NO-MERGE-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA2:![0-9]+]] +// NO-MERGE-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA6:![0-9]+]] // NO-MERGE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VTABLE]], i64 [[P_COERCE0]] // NO-MERGE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i64 -1 -// NO-MERGE-NEXT: [[TMP4:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP3]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META5:![0-9]+]] -// NO-MERGE-NEXT: br i1 [[TMP4]], label %[[MEMPTR_VIRTUAL7:.*]], label %[[TRAP:.*]], !prof [[PROF6:![0-9]+]], !nosanitize [[META5]] +// NO-MERGE-NEXT: [[TMP4:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP3]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META8:![0-9]+]] +// NO-MERGE-NEXT: br i1 [[TMP4]], label %[[MEMPTR_VIRTUAL7:.*]], label %[[TRAP:.*]], !prof [[PROF9:![0-9]+]], !nosanitize [[META8]] // NO-MERGE: [[TRAP]]: -// NO-MERGE-NEXT: tail call void @llvm.ubsantrap(i8 2) #[[ATTR3:[0-9]+]], !nosanitize [[META5]] -// NO-MERGE-NEXT: unreachable, !nosanitize [[META5]] +// NO-MERGE-NEXT: tail call void @llvm.ubsantrap(i8 2) #[[ATTR3:[0-9]+]], !nosanitize [[META8]] +// NO-MERGE-NEXT: unreachable, !nosanitize [[META8]] // NO-MERGE: [[MEMPTR_NONVIRTUAL]]: // NO-MERGE-NEXT: [[MEMPTR_NONVIRTUALFN:%.*]] = inttoptr i64 [[P_COERCE0]] to ptr -// NO-MERGE-NEXT: [[TMP5:%.*]] = tail call i1 @llvm.type.test(ptr [[MEMPTR_NONVIRTUALFN]], metadata !"_ZTSM2B1FvvE") -// NO-MERGE-NEXT: [[TMP6:%.*]] = tail call i1 @llvm.type.test(ptr [[MEMPTR_NONVIRTUALFN]], metadata !"_ZTSM2B2FvvE") -// NO-MERGE-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]], !nosanitize [[META5]] -// NO-MERGE-NEXT: br i1 [[TMP7]], label %[[MEMPTR_NONVIRTUAL23:.*]], label %[[TRAP2:.*]], !prof [[PROF6]], !nosanitize [[META5]] +// NO-MERGE-NEXT: [[TMP5:%.*]] = tail call i1 @llvm.type.test(ptr [[MEMPTR_NONVIRTUALFN]], metadata 
!"_ZTSM2B1FvvE"), !nosanitize [[META8]] +// NO-MERGE-NEXT: [[TMP6:%.*]] = tail call i1 @llvm.type.test(ptr [[MEMPTR_NONVIRTUALFN]], metadata !"_ZTSM2B2FvvE"), !nosanitize [[META8]] +// NO-MERGE-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]], !nosanitize [[META8]] +// NO-MERGE-NEXT: br i1 [[TMP7]], label %[[MEMPTR_NONVIRTUAL23:.*]], label %[[TRAP2:.*]], !prof [[PROF9]], !nosanitize [[META8]] // NO-MERGE: [[TRAP2]]: -// NO-MERGE-NEXT: tail call void @llvm.ubsantrap(i8 2) #[[ATTR4:[0-9]+]], !nosanitize [[META5]] -// NO-MERGE-NEXT: unreachable, !nosanitize [[META5]] +// NO-MERGE-NEXT: tail call void @llvm.ubsantrap(i8 2) #[[ATTR4:[0-9]+]], !nosanitize [[META8]] +// NO-MERGE-NEXT: unreachable, !nosanitize [[META8]] // NO-MERGE: [[MEMPTR_VIRTUAL7]]: -// NO-MERGE-NEXT: [[MEMPTR_VIRTUALFN:%.*]] = load ptr, ptr [[TMP3]], align 8, !nosanitize [[META5]] +// NO-MERGE-NEXT: [[MEMPTR_VIRTUALFN:%.*]] = load ptr, ptr [[TMP3]], align 8, !nosanitize [[META8]] // NO-MERGE-NEXT: tail call void [[MEMPTR_VIRTUALFN]](ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR5:[0-9]+]] -// NO-MERGE-NEXT: [[VTABLE8:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA2]] +// NO-MERGE-NEXT: [[VTABLE8:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA6]] // NO-MERGE-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[VTABLE8]], i64 [[P_COERCE0]] // NO-MERGE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 -1 -// NO-MERGE-NEXT: [[TMP10:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP9]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META5]] -// NO-MERGE-NEXT: br i1 [[TMP10]], label %[[MEMPTR_VIRTUAL19:.*]], label %[[TRAP2]], !prof [[PROF6]], !nosanitize [[META5]] +// NO-MERGE-NEXT: [[TMP10:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP9]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META8]] +// NO-MERGE-NEXT: br i1 [[TMP10]], label %[[MEMPTR_VIRTUAL19:.*]], label %[[TRAP2]], !prof [[PROF9]], !nosanitize [[META8]] // NO-MERGE: [[TRAP13:.*]]: -// NO-MERGE-NEXT: tail call void @llvm.ubsantrap(i8 2) #[[ATTR4]], !nosanitize [[META5]] -// NO-MERGE-NEXT: unreachable, !nosanitize [[META5]] +// NO-MERGE-NEXT: tail call void @llvm.ubsantrap(i8 2) #[[ATTR4]], !nosanitize [[META8]] +// NO-MERGE-NEXT: unreachable, !nosanitize [[META8]] // NO-MERGE: [[MEMPTR_VIRTUAL19]]: -// NO-MERGE-NEXT: [[MEMPTR_VIRTUALFN9:%.*]] = load ptr, ptr [[TMP9]], align 8, !nosanitize [[META5]] +// NO-MERGE-NEXT: [[MEMPTR_VIRTUALFN9:%.*]] = load ptr, ptr [[TMP9]], align 8, !nosanitize [[META8]] // NO-MERGE-NEXT: tail call void [[MEMPTR_VIRTUALFN9]](ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR5]] -// NO-MERGE-NEXT: [[VTABLE20:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA2]] +// NO-MERGE-NEXT: [[VTABLE20:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA6]] // NO-MERGE-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[VTABLE20]], i64 [[P_COERCE0]] // NO-MERGE-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP11]], i64 -1 -// NO-MERGE-NEXT: [[TMP13:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP12]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META5]] -// NO-MERGE-NEXT: [[MEMPTR_VIRTUALFN21:%.*]] = load ptr, ptr [[TMP12]], align 8, !nosanitize [[META5]] -// NO-MERGE-NEXT: br i1 [[TMP13]], label %[[MEMPTR_END27:.*]], label %[[TRAP13]], !prof [[PROF6]], !nosanitize [[META5]] +// NO-MERGE-NEXT: [[TMP13:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP12]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META8]] +// NO-MERGE-NEXT: [[MEMPTR_VIRTUALFN21:%.*]] = load 
ptr, ptr [[TMP12]], align 8, !nosanitize [[META8]] +// NO-MERGE-NEXT: br i1 [[TMP13]], label %[[MEMPTR_END27:.*]], label %[[TRAP13]], !prof [[PROF9]], !nosanitize [[META8]] // NO-MERGE: [[MEMPTR_NONVIRTUAL23]]: // NO-MERGE-NEXT: tail call void [[MEMPTR_NONVIRTUALFN]](ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR5]] // NO-MERGE-NEXT: tail call void [[MEMPTR_NONVIRTUALFN]](ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR5]] @@ -84,37 +84,37 @@ void f(S *s, void (S::*p)()) { // MERGE-NEXT: [[MEMPTR_ISVIRTUAL_NOT:%.*]] = icmp eq i64 [[TMP1]], 0 // MERGE-NEXT: br i1 [[MEMPTR_ISVIRTUAL_NOT]], label %[[MEMPTR_NONVIRTUAL:.*]], label %[[MEMPTR_VIRTUAL:.*]] // MERGE: [[MEMPTR_VIRTUAL]]: -// MERGE-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA2:![0-9]+]] +// MERGE-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA6:![0-9]+]] // MERGE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[VTABLE]], i64 [[P_COERCE0]] // MERGE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP2]], i64 -1 -// MERGE-NEXT: [[TMP4:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP3]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META5:![0-9]+]] -// MERGE-NEXT: br i1 [[TMP4]], label %[[MEMPTR_VIRTUAL6:.*]], label %[[TRAP:.*]], !prof [[PROF6:![0-9]+]], !nosanitize [[META5]] +// MERGE-NEXT: [[TMP4:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP3]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META8:![0-9]+]] +// MERGE-NEXT: br i1 [[TMP4]], label %[[MEMPTR_VIRTUAL6:.*]], label %[[TRAP:.*]], !prof [[PROF9:![0-9]+]], !nosanitize [[META8]] // MERGE: [[TRAP]]: -// MERGE-NEXT: tail call void @llvm.ubsantrap(i8 2) #[[ATTR3:[0-9]+]], !nosanitize [[META5]] -// MERGE-NEXT: unreachable, !nosanitize [[META5]] +// MERGE-NEXT: tail call void @llvm.ubsantrap(i8 2) #[[ATTR3:[0-9]+]], !nosanitize [[META8]] +// MERGE-NEXT: unreachable, !nosanitize [[META8]] // MERGE: [[MEMPTR_NONVIRTUAL]]: // MERGE-NEXT: [[MEMPTR_NONVIRTUALFN:%.*]] = inttoptr i64 [[P_COERCE0]] to ptr -// MERGE-NEXT: [[TMP5:%.*]] = tail call i1 @llvm.type.test(ptr [[MEMPTR_NONVIRTUALFN]], metadata !"_ZTSM2B1FvvE") -// MERGE-NEXT: [[TMP6:%.*]] = tail call i1 @llvm.type.test(ptr [[MEMPTR_NONVIRTUALFN]], metadata !"_ZTSM2B2FvvE") -// MERGE-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]], !nosanitize [[META5]] -// MERGE-NEXT: br i1 [[TMP7]], label %[[MEMPTR_NONVIRTUAL21:.*]], label %[[TRAP]], !prof [[PROF6]], !nosanitize [[META5]] +// MERGE-NEXT: [[TMP5:%.*]] = tail call i1 @llvm.type.test(ptr [[MEMPTR_NONVIRTUALFN]], metadata !"_ZTSM2B1FvvE"), !nosanitize [[META8]] +// MERGE-NEXT: [[TMP6:%.*]] = tail call i1 @llvm.type.test(ptr [[MEMPTR_NONVIRTUALFN]], metadata !"_ZTSM2B2FvvE"), !nosanitize [[META8]] +// MERGE-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]], !nosanitize [[META8]] +// MERGE-NEXT: br i1 [[TMP7]], label %[[MEMPTR_NONVIRTUAL21:.*]], label %[[TRAP]], !prof [[PROF9]], !nosanitize [[META8]] // MERGE: [[MEMPTR_VIRTUAL6]]: -// MERGE-NEXT: [[MEMPTR_VIRTUALFN:%.*]] = load ptr, ptr [[TMP3]], align 8, !nosanitize [[META5]] +// MERGE-NEXT: [[MEMPTR_VIRTUALFN:%.*]] = load ptr, ptr [[TMP3]], align 8, !nosanitize [[META8]] // MERGE-NEXT: tail call void [[MEMPTR_VIRTUALFN]](ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR4:[0-9]+]] -// MERGE-NEXT: [[VTABLE7:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA2]] +// MERGE-NEXT: [[VTABLE7:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA6]] // MERGE-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr 
[[VTABLE7]], i64 [[P_COERCE0]] // MERGE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 -1 -// MERGE-NEXT: [[TMP10:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP9]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META5]] -// MERGE-NEXT: br i1 [[TMP10]], label %[[MEMPTR_VIRTUAL17:.*]], label %[[TRAP]], !prof [[PROF6]], !nosanitize [[META5]] +// MERGE-NEXT: [[TMP10:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP9]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META8]] +// MERGE-NEXT: br i1 [[TMP10]], label %[[MEMPTR_VIRTUAL17:.*]], label %[[TRAP]], !prof [[PROF9]], !nosanitize [[META8]] // MERGE: [[MEMPTR_VIRTUAL17]]: -// MERGE-NEXT: [[MEMPTR_VIRTUALFN8:%.*]] = load ptr, ptr [[TMP9]], align 8, !nosanitize [[META5]] +// MERGE-NEXT: [[MEMPTR_VIRTUALFN8:%.*]] = load ptr, ptr [[TMP9]], align 8, !nosanitize [[META8]] // MERGE-NEXT: tail call void [[MEMPTR_VIRTUALFN8]](ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR4]] -// MERGE-NEXT: [[VTABLE18:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA2]] +// MERGE-NEXT: [[VTABLE18:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[VTABLE_POINTER_TBAA6]] // MERGE-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[VTABLE18]], i64 [[P_COERCE0]] // MERGE-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP11]], i64 -1 -// MERGE-NEXT: [[TMP13:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP12]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META5]] -// MERGE-NEXT: [[MEMPTR_VIRTUALFN19:%.*]] = load ptr, ptr [[TMP12]], align 8, !nosanitize [[META5]] -// MERGE-NEXT: br i1 [[TMP13]], label %[[MEMPTR_END24:.*]], label %[[TRAP]], !prof [[PROF6]], !nosanitize [[META5]] +// MERGE-NEXT: [[TMP13:%.*]] = tail call i1 @llvm.type.test(ptr [[TMP12]], metadata !"_ZTSM1SFvvE.virtual"), !nosanitize [[META8]] +// MERGE-NEXT: [[MEMPTR_VIRTUALFN19:%.*]] = load ptr, ptr [[TMP12]], align 8, !nosanitize [[META8]] +// MERGE-NEXT: br i1 [[TMP13]], label %[[MEMPTR_END24:.*]], label %[[TRAP]], !prof [[PROF9]], !nosanitize [[META8]] // MERGE: [[MEMPTR_NONVIRTUAL21]]: // MERGE-NEXT: tail call void [[MEMPTR_NONVIRTUALFN]](ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR4]] // MERGE-NEXT: tail call void [[MEMPTR_NONVIRTUALFN]](ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]]) #[[ATTR4]] @@ -125,15 +125,15 @@ void f(S *s, void (S::*p)()) { // MERGE-NEXT: ret void // //. -// NO-MERGE: [[VTABLE_POINTER_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// NO-MERGE: [[META3]] = !{!"vtable pointer", [[META4:![0-9]+]], i64 0} -// NO-MERGE: [[META4]] = !{!"Simple C++ TBAA"} -// NO-MERGE: [[META5]] = !{} -// NO-MERGE: [[PROF6]] = !{!"branch_weights", i32 1048575, i32 1} +// NO-MERGE: [[META5:![0-9]+]] = !{!"Simple C++ TBAA"} +// NO-MERGE: [[VTABLE_POINTER_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// NO-MERGE: [[META7]] = !{!"vtable pointer", [[META5]], i64 0} +// NO-MERGE: [[META8]] = !{} +// NO-MERGE: [[PROF9]] = !{!"branch_weights", i32 1048575, i32 1} //. -// MERGE: [[VTABLE_POINTER_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// MERGE: [[META3]] = !{!"vtable pointer", [[META4:![0-9]+]], i64 0} -// MERGE: [[META4]] = !{!"Simple C++ TBAA"} -// MERGE: [[META5]] = !{} -// MERGE: [[PROF6]] = !{!"branch_weights", i32 1048575, i32 1} +// MERGE: [[META5:![0-9]+]] = !{!"Simple C++ TBAA"} +// MERGE: [[VTABLE_POINTER_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// MERGE: [[META7]] = !{!"vtable pointer", [[META5]], i64 0} +// MERGE: [[META8]] = !{} +// MERGE: [[PROF9]] = !{!"branch_weights", i32 1048575, i32 1} //. 
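For readers without the full test at hand: reconstructed from the f(S*, void (S::*)()) signature in the hunk context and the _ZTSM2B1FvvE / _ZTSM2B2FvvE / _ZTSM1SFvvE.virtual type-test strings above, the source pattern these NO-MERGE/MERGE checks exercise is roughly the sketch below (the struct and member names here are assumptions; the exact definitions live in the unshown part of the test). Each indirect call through the member pointer is guarded by llvm.type.test on both the virtual and non-virtual paths, and the two prefixes differ only in whether each failed check keeps its own llvm.ubsantrap block (TRAP, TRAP2, TRAP13 under NO-MERGE) or every failure branches to one shared TRAP block (under MERGE).

// Reconstructed sketch, not the verbatim test source:
struct B1 { void g(); };
struct B2 { void h(); };
struct S : B1, B2 {};   // two bases, hence the two non-virtual type tests

void f(S *s, void (S::*p)()) {
  (s->*p)();  // each call re-checks its target before dispatching
  (s->*p)();
  (s->*p)();
}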
diff --git a/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp b/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp index ab3695a3d9ce3..30941653c150d 100644 --- a/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp +++ b/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp @@ -110,8 +110,8 @@ int first_i32_ulong2(int x, ulong2 *y) { return first(x, *y); } // CHECK-LABEL: define void @second_i32_ulong2( // CHECK-SAME: i32 noundef [[X:%.*]], ptr noundef readonly captures(none) [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 16)) [[R:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 16, !tbaa [[INT_TBAA2:![0-9]+]] -// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[R]], align 16, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 16, !tbaa [[DOUBLE_TBAA6:![0-9]+]] +// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[R]], align 16, !tbaa [[DOUBLE_TBAA6]] // CHECK-NEXT: ret void // void second_i32_ulong2(int x, ulong2 *y, ulong2 *r) { @@ -121,8 +121,8 @@ void second_i32_ulong2(int x, ulong2 *y, ulong2 *r) { // CHECK-LABEL: define void @first_ulong2_i32( // CHECK-SAME: ptr noundef readonly captures(none) [[X:%.*]], i32 noundef [[Y:%.*]], ptr noundef writeonly captures(none) initializes((0, 16)) [[R:%.*]]) local_unnamed_addr #[[ATTR1]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X]], align 16, !tbaa [[INT_TBAA2]] -// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[R]], align 16, !tbaa [[INT_TBAA2]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X]], align 16, !tbaa [[DOUBLE_TBAA6]] +// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[R]], align 16, !tbaa [[DOUBLE_TBAA6]] // CHECK-NEXT: ret void // void first_ulong2_i32(ulong2 *x, int y, ulong2 *r) { @@ -180,7 +180,7 @@ void first_asc_i32(asc *x, int y, asc *r) { *r = first(*x, y); } int second_asc_i32(asc *x, int y) { return second(*x, y); } } //. -// CHECK: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"Simple C++ TBAA"} +// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK: [[DOUBLE_TBAA6]] = !{[[META4]], [[META4]], i64 0} //. 
diff --git a/clang/test/CodeGenCXX/load-reference-metadata.cpp b/clang/test/CodeGenCXX/load-reference-metadata.cpp index abfdd055c3ad6..b9777f4e0efd2 100644 --- a/clang/test/CodeGenCXX/load-reference-metadata.cpp +++ b/clang/test/CodeGenCXX/load-reference-metadata.cpp @@ -13,18 +13,18 @@ struct S { // CHECK-SAME: ptr noundef nonnull align 8 dereferenceable(24) [[S:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[S_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[S]], ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1SPTR_TBAA2:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1SPTR_TBAA2]], !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]] +// CHECK-NEXT: store ptr [[S]], ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1SPTR_TBAA6:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1SPTR_TBAA6]], !nonnull [[META9:![0-9]+]], !align [[META10:![0-9]+]] // CHECK-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8, !tbaa [[CHARPTR_TBAA9:![0-9]+]], !nonnull [[META7]] -// CHECK-NEXT: store i8 0, ptr [[TMP1]], align 1, !tbaa [[CHAR_TBAA14:![0-9]+]] -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1SPTR_TBAA2]], !nonnull [[META7]], !align [[META8]] +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A]], align 8, !tbaa [[CHARPTR_TBAA11:![0-9]+]], !nonnull [[META9]] +// CHECK-NEXT: store i8 0, ptr [[TMP1]], align 1, !tbaa [[CHAR_TBAA16:![0-9]+]] +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1SPTR_TBAA6]], !nonnull [[META9]], !align [[META10]] // CHECK-NEXT: [[B:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP2]], i32 0, i32 1 -// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B]], align 8, !tbaa [[INTPTR_TBAA15:![0-9]+]], !nonnull [[META7]], !align [[META16:![0-9]+]] -// CHECK-NEXT: store i32 0, ptr [[TMP3]], align 4, !tbaa [[INT_TBAA17:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1SPTR_TBAA2]], !nonnull [[META7]], !align [[META8]] +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B]], align 8, !tbaa [[INTPTR_TBAA17:![0-9]+]], !nonnull [[META9]], !align [[META18:![0-9]+]] +// CHECK-NEXT: store i32 0, ptr [[TMP3]], align 4, !tbaa [[INT_TBAA2:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1SPTR_TBAA6]], !nonnull [[META9]], !align [[META10]] // CHECK-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP4]], i32 0, i32 2 -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C]], align 8, !tbaa [[_ZTS1FPTR_TBAA19:![0-9]+]], !nonnull [[META7]], !align [[META20:![0-9]+]] +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C]], align 8, !tbaa [[_ZTS1FPTR_TBAA19:![0-9]+]], !nonnull [[META9]], !align [[META20:![0-9]+]] // CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_F:%.*]], ptr [[TMP5]], i32 0, i32 0 // CHECK-NEXT: store i32 0, ptr [[X]], align 32, !tbaa [[INT_TBAA21:![0-9]+]] // CHECK-NEXT: ret void @@ -42,10 +42,10 @@ extern B (&bb)[2]; // CHECK-LABEL: define dso_local void @_Z13test_externalv( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @b, align 8, !tbaa [[_ZTS1BPTR_TBAA23:![0-9]+]], !nonnull [[META7]], !align [[META8]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @b, align 8, !tbaa [[_ZTS1BPTR_TBAA23:![0-9]+]], !nonnull [[META9]], !align [[META10]] // CHECK-NEXT: [[C:%.*]] = getelementptr inbounds nuw 
[[STRUCT_B:%.*]], ptr [[TMP0]], i32 0, i32 2 // CHECK-NEXT: store i8 0, ptr [[C]], align 8, !tbaa [[CHAR_TBAA25:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr @bb, align 8, !tbaa [[_ZTS1BPTR_TBAA23]], !nonnull [[META7]], !align [[META20]] +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr @bb, align 8, !tbaa [[_ZTS1BPTR_TBAA23]], !nonnull [[META9]], !align [[META20]] // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x %struct.B], ptr [[TMP1]], i64 0, i64 0 // CHECK-NEXT: [[C1:%.*]] = getelementptr inbounds nuw [[STRUCT_B]], ptr [[ARRAYIDX]], i32 0, i32 2 // CHECK-NEXT: store i8 0, ptr [[C1]], align 16, !tbaa [[CHAR_TBAA25]] @@ -61,7 +61,7 @@ void test_external() { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[S_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr [[S]], ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1BPTR_TBAA23]] -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1BPTR_TBAA23]], !nonnull [[META7]], !align [[META8]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !tbaa [[_ZTS1BPTR_TBAA23]], !nonnull [[META9]], !align [[META10]] // CHECK-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_B:%.*]], ptr [[TMP0]], i32 0, i32 2 // CHECK-NEXT: ret ptr [[C]] // @@ -69,30 +69,30 @@ char* test_deref_only(B &s) { return &s.c; } //. -// CHECK: [[_ZTS1SPTR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"p1 _ZTS1S", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"any pointer", [[META5:![0-9]+]], i64 0} -// CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// CHECK: [[META6]] = !{!"Simple C++ TBAA"} -// CHECK: [[META7]] = !{} -// CHECK: [[META8]] = !{i64 8} -// CHECK: [[CHARPTR_TBAA9]] = !{[[META10:![0-9]+]], [[META11:![0-9]+]], i64 0} -// CHECK: [[META10]] = !{!"_ZTS1S", [[META11]], i64 0, [[META12:![0-9]+]], i64 8, [[META13:![0-9]+]], i64 16} -// CHECK: [[META11]] = !{!"p1 omnipotent char", [[META4]], i64 0} -// CHECK: [[META12]] = !{!"p1 int", [[META4]], i64 0} -// CHECK: [[META13]] = !{!"p1 _ZTS1F", [[META4]], i64 0} -// CHECK: [[CHAR_TBAA14]] = !{[[META5]], [[META5]], i64 0} -// CHECK: [[INTPTR_TBAA15]] = !{[[META10]], [[META12]], i64 8} -// CHECK: [[META16]] = !{i64 4} -// CHECK: [[INT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} -// CHECK: [[META18]] = !{!"int", [[META5]], i64 0} -// CHECK: [[_ZTS1FPTR_TBAA19]] = !{[[META10]], [[META13]], i64 16} +// CHECK: [[INT_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C++ TBAA"} +// CHECK: [[_ZTS1SPTR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"p1 _ZTS1S", [[META8:![0-9]+]], i64 0} +// CHECK: [[META8]] = !{!"any pointer", [[META4]], i64 0} +// CHECK: [[META9]] = !{} +// CHECK: [[META10]] = !{i64 8} +// CHECK: [[CHARPTR_TBAA11]] = !{[[META12:![0-9]+]], [[META13:![0-9]+]], i64 0} +// CHECK: [[META12]] = !{!"_ZTS1S", [[META13]], i64 0, [[META14:![0-9]+]], i64 8, [[META15:![0-9]+]], i64 16} +// CHECK: [[META13]] = !{!"p1 omnipotent char", [[META8]], i64 0} +// CHECK: [[META14]] = !{!"p1 int", [[META8]], i64 0} +// CHECK: [[META15]] = !{!"p1 _ZTS1F", [[META8]], i64 0} +// CHECK: [[CHAR_TBAA16]] = !{[[META4]], [[META4]], i64 0} +// CHECK: [[INTPTR_TBAA17]] = !{[[META12]], [[META14]], i64 8} +// CHECK: [[META18]] = !{i64 4} +// CHECK: [[_ZTS1FPTR_TBAA19]] = !{[[META12]], [[META15]], i64 16} // CHECK: [[META20]] = !{i64 32} -// CHECK: 
[[INT_TBAA21]] = !{[[META22:![0-9]+]], [[META18]], i64 0} -// CHECK: [[META22]] = !{!"_ZTS1F", [[META18]], i64 0} +// CHECK: [[INT_TBAA21]] = !{[[META22:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META22]] = !{!"_ZTS1F", [[META3]], i64 0} // CHECK: [[_ZTS1BPTR_TBAA23]] = !{[[META24:![0-9]+]], [[META24]], i64 0} -// CHECK: [[META24]] = !{!"p1 _ZTS1B", [[META4]], i64 0} -// CHECK: [[CHAR_TBAA25]] = !{[[META26:![0-9]+]], [[META5]], i64 16} -// CHECK: [[META26]] = !{!"_ZTS1B", [[META27:![0-9]+]], i64 8, [[META5]], i64 16} -// CHECK: [[META27]] = !{!"long long", [[META5]], i64 0} +// CHECK: [[META24]] = !{!"p1 _ZTS1B", [[META8]], i64 0} +// CHECK: [[CHAR_TBAA25]] = !{[[META26:![0-9]+]], [[META4]], i64 16} +// CHECK: [[META26]] = !{!"_ZTS1B", [[META27:![0-9]+]], i64 8, [[META4]], i64 16} +// CHECK: [[META27]] = !{!"long long", [[META4]], i64 0} //. diff --git a/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp b/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp index 4fb977a5367e7..e40b2d7ae43ea 100644 --- a/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp +++ b/clang/test/CodeGenCXX/sizeof-unwind-exception.cpp @@ -3,6 +3,8 @@ // RUN: %clang_cc1 -triple x86_64-apple-darwin10 -emit-llvm -fcxx-exceptions -fexceptions %s -O2 -o - | FileCheck %s --check-prefix=ARM-DARWIN // RUN: %clang_cc1 -triple arm-unknown-gnueabi -emit-llvm -fcxx-exceptions -fexceptions %s -O2 -o - | FileCheck %s --check-prefix=ARM-EABI // RUN: %clang_cc1 -triple mipsel-unknown-unknown -emit-llvm -fcxx-exceptions -fexceptions %s -O2 -o - | FileCheck %s --check-prefix=MIPS +// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -fcxx-exceptions -fexceptions -exception-model=seh %s -O2 -o - | FileCheck %s --check-prefix=MINGW-X86-64 +// RUN: %clang_cc1 -triple thumbv7-windows-gnu -emit-llvm -fcxx-exceptions -fexceptions -exception-model=seh %s -O2 -o - | FileCheck %s --check-prefix=MINGW-ARMV7 void foo(); void test() { @@ -25,9 +27,15 @@ void test() { // ARM-EABI-NEXT: [[T1:%.*]] = getelementptr i8, ptr [[EXN]], i32 88 // MIPS: [[T0:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXN:%.*]]) [[NUW:#[0-9]+]] // MIPS-NEXT: [[T1:%.*]] = getelementptr i8, ptr [[EXN]], i32 24 +// MINGW-X86-64: [[T0:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[EXN:%.*]]) [[NUW:#[0-9]+]] +// MINGW-X86-64-NEXT:[[T1:%.*]] = getelementptr i8, ptr [[EXN]], i64 64 +// MINGW-ARMV7: [[T0:%.*]] = tail call arm_aapcs_vfpcc ptr @__cxa_begin_catch(ptr [[EXN:%.*]]) [[NUW:#[0-9]+]] +// MINGW-ARMV7-NEXT: [[T1:%.*]] = getelementptr i8, ptr [[EXN]], i32 48 // X86-64: attributes [[NUW]] = { nounwind } // X86-32: attributes [[NUW]] = { nounwind } // ARM-DARWIN: attributes [[NUW]] = { nounwind } // ARM-EABI: attributes [[NUW]] = { nounwind } // MIPS: attributes [[NUW]] = { nounwind } +// MINGW-X86-64: attributes [[NUW]] = { nounwind } +// MINGW-ARMV7: attributes [[NUW]] = { nounwind } diff --git a/clang/test/CodeGenCXX/std-byte.cpp b/clang/test/CodeGenCXX/std-byte.cpp index 63f41e6f1e16a..00191b2711a34 100644 --- a/clang/test/CodeGenCXX/std-byte.cpp +++ b/clang/test/CodeGenCXX/std-byte.cpp @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -std=c++1z -Werror -triple i386-unknown-unknown -emit-llvm -O1 -disable-llvm-passes -o - %s | FileCheck %s // std::byte should be considered equivalent to char for aliasing. 
@@ -8,10 +9,10 @@ enum byte : unsigned char {}; // CHECK-LABEL: define{{.*}} void @test0( extern "C" void test0(std::byte *sb, int *i) { - // CHECK: store i8 0, ptr %{{.*}} !tbaa [[TAG_CHAR:!.*]] + // CHECK: store i8 0, ptr %{{.*}}, align 1, !tbaa [[TBAA11:![0-9]+]] *sb = std::byte{0}; - // CHECK: store i32 1, ptr %{{.*}} !tbaa [[TAG_INT:!.*]] + // CHECK: store i32 1, ptr %{{.*}}, align 4, !tbaa [[TBAA3:![0-9]+]] *i = 1; } @@ -27,15 +28,24 @@ enum byte : unsigned char {}; // CHECK-LABEL: define{{.*}} void @test1( extern "C" void test1(::byte *b, ::my::byte *mb, ::my::std::byte *msb) { + // CHECK: store i8 0, ptr %{{.*}}, align 1, !tbaa [[TBAA12:![0-9]+]] *b = ::byte{0}; + // CHECK: store i8 0, ptr %{{.*}}, align 1, !tbaa [[TBAA14:![0-9]+]] *mb = ::my::byte{0}; + // CHECK: store i8 0, ptr %{{.*}}, align 1, !tbaa [[TBAA16:![0-9]+]] *msb = ::my::std::byte{0}; - // CHECK-NOT: store i8 0, ptr %{{.*}} !tbaa [[TAG_CHAR]] } -// CHECK: !"any pointer", [[TYPE_CHAR:!.*]], -// CHECK: [[TYPE_CHAR]] = !{!"omnipotent char", [[TAG_CXX_TBAA:!.*]], -// CHECK: [[TAG_CXX_TBAA]] = !{!"Simple C++ TBAA"} -// CHECK: [[TAG_CHAR]] = !{[[TYPE_CHAR:!.*]], [[TYPE_CHAR]], i64 0} -// CHECK: [[TAG_INT]] = !{[[TYPE_INT:!.*]], [[TYPE_INT]], i64 0} -// CHECK: [[TYPE_INT]] = !{!"int", [[TYPE_CHAR]] +//. +// CHECK: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"Simple C++ TBAA"} +// CHECK: [[TBAA11]] = !{[[META5]], [[META5]], i64 0} +// CHECK: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +// CHECK: [[META13]] = !{!"_ZTS4byte", [[META5]], i64 0} +// CHECK: [[TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0} +// CHECK: [[META15]] = !{!"_ZTSN2my4byteE", [[META5]], i64 0} +// CHECK: [[TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} +// CHECK: [[META17]] = !{!"_ZTSN2my3std4byteE", [[META5]], i64 0} +//. 
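The comment at the top of std-byte.cpp ("std::byte should be considered equivalent to char for aliasing") is exactly what the regenerated tags encode: TBAA11 reuses the omnipotent-char node for std::byte, while ::byte, ::my::byte, and ::my::std::byte each get a distinct _ZTS-named node. A minimal sketch of the optimizer-visible consequence (not part of the test; assumes strict aliasing at -O1 or higher):

#include <cstddef>

// The store through sb carries char-compatible TBAA, so it may alias *i
// and the second read of *i cannot be folded into the first. A store
// through a user-defined byte-like enum (such as ::my::byte above) carries
// its own TBAA tag and would permit that folding.
int observe(std::byte *sb, int *i) {
  int before = *i;
  *sb = std::byte{42};
  return before + *i;
}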
diff --git a/clang/test/CodeGenHLSL/Operators/logical-not.hlsl b/clang/test/CodeGenHLSL/Operators/logical-not.hlsl new file mode 100644 index 0000000000000..0f9d0677d8610 --- /dev/null +++ b/clang/test/CodeGenHLSL/Operators/logical-not.hlsl @@ -0,0 +1,33 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -disable-llvm-passes -emit-llvm -finclude-default-header -fnative-half-type -o - %s | FileCheck %s + +// CHECK-LABEL: case1 +// CHECK: [[ToBool:%.*]] = icmp ne <2 x i32> {{.*}}, zeroinitializer +// CHECK-NEXT: [[BoolCmp:%.*]] = icmp eq <2 x i1> [[ToBool]], zeroinitializer +// CHECK-NEXT: {{.*}} = zext <2 x i1> [[BoolCmp]] to <2 x i32> +export uint32_t2 case1(uint32_t2 b) { + return !b; +} + +// CHECK-LABEL: case2 +// CHECK: [[ToBool:%.*]] = icmp ne <3 x i32> {{.*}}, zeroinitializer +// CHECK-NEXT: [[BoolCmp:%.*]] = icmp eq <3 x i1> [[ToBool]], zeroinitializer +// CHECK-NEXT: {{.*}} = zext <3 x i1> [[BoolCmp]] to <3 x i32> +export int32_t3 case2(int32_t3 b) { + return !b; +} + +// CHECK-LABEL: case3 +// CHECK: [[ToBool:%.*]] = fcmp reassoc nnan ninf nsz arcp afn une half {{.*}}, 0xH0000 +// CHECK-NEXT: [[BoolCmp:%.*]] = xor i1 [[ToBool]], true +// CHECK-NEXT: {{.*}} = uitofp i1 [[BoolCmp]] to half +export float16_t case3(float16_t b) { + return !b; +} + +// CHECK-LABEL: case4 +// CHECK: [[ToBool:%.*]] = fcmp reassoc nnan ninf nsz arcp afn une <4 x float> {{.*}}, zeroinitializer +// CHECK-NEXT: [[BoolCmp:%.*]] = icmp eq <4 x i1> [[ToBool]], zeroinitializer +// CHECK-NEXT: {{.*}} = uitofp <4 x i1> [[BoolCmp]] to <4 x float> +export float4 case4(float4 b) { + return !b; +} diff --git a/clang/test/CodeGenHLSL/basic-target.c b/clang/test/CodeGenHLSL/basic-target.c index c700e06bd5850..b9482df5a0987 100644 --- a/clang/test/CodeGenHLSL/basic-target.c +++ b/clang/test/CodeGenHLSL/basic-target.c @@ -6,5 +6,5 @@ // RUN: %clang -cc1 -triple dxil-pc-shadermodel6.0-domain -emit-llvm -o - %s | FileCheck %s // RUN: %clang -cc1 -triple dxil-pc-shadermodel6.0-geometry -emit-llvm -o - %s | FileCheck %s -// CHECK: target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" +// CHECK: target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64-v48:16:16-v96:32:32-v192:64:64" // CHECK: target triple = "dxilv1.0-pc-shadermodel6.0-{{[a-z]+}}" diff --git a/clang/test/CodeGenHLSL/resources/ByteAddressBuffers-methods.hlsl b/clang/test/CodeGenHLSL/resources/ByteAddressBuffers-methods.hlsl new file mode 100644 index 0000000000000..9dd02287620e7 --- /dev/null +++ b/clang/test/CodeGenHLSL/resources/ByteAddressBuffers-methods.hlsl @@ -0,0 +1,45 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,DXIL +// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,SPIRV + +// NOTE: SPIRV codegen for resource methods is not yet implemented + +ByteAddressBuffer Buf : register(t0); +RWByteAddressBuffer RWBuf : register(u0); + +// DXIL: %"class.hlsl::ByteAddressBuffer" = type { target("dx.RawBuffer", i8, 0, 0) } +// DXIL: %"class.hlsl::RWByteAddressBuffer" = type { target("dx.RawBuffer", i8, 1, 0) } + +// DXIL: @Buf = internal global %"class.hlsl::ByteAddressBuffer" poison +// DXIL: @RWBuf = internal global %"class.hlsl::RWByteAddressBuffer" poison + +export uint TestGetDimensions() { + uint dim1, 
dim2; + Buf.GetDimensions(dim1); + RWBuf.GetDimensions(dim2); + return dim1 + dim2; +} + +// CHECK: define {{.*}} @TestGetDimensions()() +// CHECK: call void @hlsl::ByteAddressBuffer::GetDimensions(unsigned int&)(ptr {{.*}} @Buf, ptr{{.*}}) +// CHECK: call void @hlsl::RWByteAddressBuffer::GetDimensions(unsigned int&)(ptr{{.*}} @RWBuf, ptr{{.*}}) +// CHECK: add +// CHECK: ret + +// CHECK: define {{.*}} void @hlsl::ByteAddressBuffer::GetDimensions(unsigned int&)(ptr {{.*}} %this, {{.*}} %dim) +// CHECK: %[[HANDLE_PTR:.*]] = getelementptr inbounds nuw %"class.hlsl::ByteAddressBuffer", ptr %{{.*}}, i32 0, i32 0 +// CHECK-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", i8, 0, 0), ptr %[[HANDLE_PTR]] +// CHECK-NEXT: %[[DIMPTR:.*]] = load ptr, ptr %dim.addr +// DXIL-NEXT: %[[DIM:.*]] = call i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_i8_0_0t(target("dx.RawBuffer", i8, 0, 0) %[[HANDLE]]) +// CHECK-NEXT: store i32 %[[DIM]], ptr %[[DIMPTR]] +// CHECK-NEXT: ret void + +// CHECK: define {{.*}} void @hlsl::RWByteAddressBuffer::GetDimensions(unsigned int&)(ptr {{.*}} %this, ptr noalias {{.*}} %dim) +// CHECK: %[[HANDLE_PTR:.*]] = getelementptr inbounds nuw %"class.hlsl::RWByteAddressBuffer", ptr %{{.*}}, i32 0, i32 0 +// CHECK-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", i8, 1, 0), ptr %[[HANDLE_PTR]] +// CHECK-NEXT: %[[DIMPTR:.*]] = load ptr, ptr %dim.addr +// DXIL-NEXT: %[[DIM:.*]] = call i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_i8_1_0t(target("dx.RawBuffer", i8, 1, 0) %[[HANDLE]]) +// CHECK-NEXT: store i32 %[[DIM]], ptr %[[DIMPTR]] +// CHECK-NEXT: ret void + +// DXIL: declare i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_i8_0_0t(target("dx.RawBuffer", i8, 0, 0)) +// DXIL: declare i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_i8_1_0t(target("dx.RawBuffer", i8, 1, 0)) diff --git a/clang/test/CodeGenHLSL/resources/RWBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/RWBuffer-elementtype.hlsl deleted file mode 100644 index f48521b0f1764..0000000000000 --- a/clang/test/CodeGenHLSL/resources/RWBuffer-elementtype.hlsl +++ /dev/null @@ -1,70 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL -// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=SPIRV - -// DXIL: %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", i16, 1, 0, 1) } -// DXIL: %"class.hlsl::RWBuffer.0" = type { target("dx.TypedBuffer", i16, 1, 0, 0) } -// DXIL: %"class.hlsl::RWBuffer.1" = type { target("dx.TypedBuffer", i32, 1, 0, 1) } -// DXIL: %"class.hlsl::RWBuffer.2" = type { target("dx.TypedBuffer", i32, 1, 0, 0) } -// DXIL: %"class.hlsl::RWBuffer.3" = type { target("dx.TypedBuffer", i64, 1, 0, 1) } -// DXIL: %"class.hlsl::RWBuffer.4" = type { target("dx.TypedBuffer", i64, 1, 0, 0) } -// DXIL: %"class.hlsl::RWBuffer.5" = type { target("dx.TypedBuffer", half, 1, 0, 0) } -// DXIL: %"class.hlsl::RWBuffer.6" = type { target("dx.TypedBuffer", float, 1, 0, 0) } -// DXIL: %"class.hlsl::RWBuffer.7" = type { target("dx.TypedBuffer", double, 1, 0, 0) } -// DXIL: %"class.hlsl::RWBuffer.8" = type { target("dx.TypedBuffer", <4 x i16>, 1, 0, 1) } -// DXIL: %"class.hlsl::RWBuffer.9" = type { target("dx.TypedBuffer", <3 x i32>, 1, 0, 0) } -// DXIL: %"class.hlsl::RWBuffer.10" = type { target("dx.TypedBuffer", <2 x half>, 1, 0, 0) } -// DXIL: %"class.hlsl::RWBuffer.11" = type { target("dx.TypedBuffer", 
<3 x float>, 1, 0, 0) } -// DXIL: %"class.hlsl::RWBuffer.12" = type { target("dx.TypedBuffer", <4 x i32>, 1, 0, 1) } - -// SPIRV: %"class.hlsl::RWBuffer" = type { target("spirv.SignedImage", i16, 5, 2, 0, 0, 2, 0) } -// SPIRV: %"class.hlsl::RWBuffer.0" = type { target("spirv.Image", i16, 5, 2, 0, 0, 2, 0) } -// SPIRV: %"class.hlsl::RWBuffer.1" = type { target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 24) } -// SPIRV: %"class.hlsl::RWBuffer.2" = type { target("spirv.Image", i32, 5, 2, 0, 0, 2, 33) } -// SPIRV: %"class.hlsl::RWBuffer.3" = type { target("spirv.SignedImage", i64, 5, 2, 0, 0, 2, 41) } -// SPIRV: %"class.hlsl::RWBuffer.4" = type { target("spirv.Image", i64, 5, 2, 0, 0, 2, 40) } -// SPIRV: %"class.hlsl::RWBuffer.5" = type { target("spirv.Image", half, 5, 2, 0, 0, 2, 0) } -// SPIRV: %"class.hlsl::RWBuffer.6" = type { target("spirv.Image", float, 5, 2, 0, 0, 2, 3) } -// SPIRV: %"class.hlsl::RWBuffer.7" = type { target("spirv.Image", double, 5, 2, 0, 0, 2, 0) } -// SPIRV: %"class.hlsl::RWBuffer.8" = type { target("spirv.SignedImage", i16, 5, 2, 0, 0, 2, 0) } -// SPIRV: %"class.hlsl::RWBuffer.9" = type { target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) } -// SPIRV: %"class.hlsl::RWBuffer.10" = type { target("spirv.Image", half, 5, 2, 0, 0, 2, 0) } -// SPIRV: %"class.hlsl::RWBuffer.11" = type { target("spirv.Image", float, 5, 2, 0, 0, 2, 0) } -// SPIRV: %"class.hlsl::RWBuffer.12" = type { target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 21) } - -RWBuffer<int16_t> BufI16; -RWBuffer<uint16_t> BufU16; -RWBuffer<int> BufI32; -RWBuffer<uint> BufU32; -RWBuffer<int64_t> BufI64; -RWBuffer<uint64_t> BufU64; -RWBuffer<half> BufF16; -RWBuffer<float> BufF32; -RWBuffer<double> BufF64; -RWBuffer< vector<int16_t, 4> > BufI16x4; -RWBuffer< vector<uint, 3> > BufU32x3; -RWBuffer<half2> BufF16x2; -RWBuffer<float3> BufF32x3; -RWBuffer<int4> BufI32x4; -// TODO: RWBuffer<snorm half> BufSNormF16; -> 11 -// TODO: RWBuffer<unorm half> BufUNormF16; -> 12 -// TODO: RWBuffer<snorm float> BufSNormF32; -> 13 -// TODO: RWBuffer<unorm float> BufUNormF32; -> 14 -// TODO: RWBuffer<snorm double> BufSNormF64; -> 15 -// TODO: RWBuffer<unorm double> BufUNormF64; -> 16 - -[numthreads(1,1,1)] -void main(int GI : SV_GroupIndex) { - BufI16[GI] = 0; - BufU16[GI] = 0; - BufI32[GI] = 0; - BufU32[GI] = 0; - BufI64[GI] = 0; - BufU64[GI] = 0; - BufF16[GI] = 0; - BufF32[GI] = 0; - BufF64[GI] = 0; - BufI16x4[GI] = 0; - BufU32x3[GI] = 0; - BufF16x2[GI] = 0; - BufF32x3[GI] = 0; -} diff --git a/clang/test/CodeGenHLSL/resources/RWBuffer-subscript.hlsl b/clang/test/CodeGenHLSL/resources/RWBuffer-subscript.hlsl deleted file mode 100644 index 0de171cb452d8..0000000000000 --- a/clang/test/CodeGenHLSL/resources/RWBuffer-subscript.hlsl +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=DXC,CHECK -// RUN: %clang_cc1 -triple spirv1.6-pc-vulkan1.3-compute -fspv-use-unknown-image-format -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=SPIRV,CHECK - -RWBuffer<int> In; -RWBuffer<int> Out; - -[numthreads(1,1,1)] -void main(unsigned GI : SV_GroupIndex) { - // CHECK: define void @main() - - // DXC: %[[INPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}}) - // SPIRV: %[[INPTR:.*]] = call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}}) - // CHECK: %[[LOAD:.*]] = load i32, ptr {{.*}}%[[INPTR]] - // DXC: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr 
@llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}}) - // SPIRV: %[[OUTPTR:.*]] = call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}}) - // CHECK: store i32 %[[LOAD]], ptr {{.*}}%[[OUTPTR]] - Out[GI] = In[GI]; - - // DXC: %[[INPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}}) - // SPIRV: %[[INPTR:.*]] = call ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}}) - // CHECK: %[[LOAD:.*]] = load i32, ptr {{.*}}%[[INPTR]] - // DXC: %[[OUTPTR:.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}}) - // SPIRV: %[[OUTPTR:.*]] = call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}}) - // CHECK: store i32 %[[LOAD]], ptr {{.*}}%[[OUTPTR]] - Out[GI] = In.Load(GI); -} diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl index 43ddd2e768ea0..1f248d0560006 100644 --- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl +++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-lib.hlsl @@ -1,64 +1,160 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,DXIL +// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,SPV // NOTE: SPIRV codegen for resource methods is not yet implemented StructuredBuffer<float> SB1 : register(t0); RWStructuredBuffer<float> RWSB1 : register(u0); -RWStructuredBuffer<float> RWSB2 : register(u1); +RWStructuredBuffer<int4> RWSB2 : register(u1); AppendStructuredBuffer<float> ASB : register(u2); -ConsumeStructuredBuffer<float> CSB : register(u3); +ConsumeStructuredBuffer<double> CSB : register(u3); -// CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 0, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) } -// CHECK: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) } -// CHECK: %"class.hlsl::ConsumeStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) } +// DXIL: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 0, 0) } +// DXIL: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) } +// DXIL: %"class.hlsl::RWStructuredBuffer.0" = type { target("dx.RawBuffer", <4 x i32>, 1, 0), 
target("dx.RawBuffer", <4 x i32>, 1, 0) } +// DXIL: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) } +// DXIL: %"class.hlsl::ConsumeStructuredBuffer" = type { target("dx.RawBuffer", double, 1, 0), target("dx.RawBuffer", double, 1, 0) } export int TestIncrementCounter() { return RWSB1.IncrementCounter(); } -// CHECK: define noundef i32 @_Z20TestIncrementCounterv() -// CHECK-DXIL: %[[INDEX:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i8 1) -// CHECK-DXIL: ret i32 %[[INDEX]] +// CHECK: define noundef i32 @TestIncrementCounter()() +// CHECK: call noundef i32 @hlsl::RWStructuredBuffer::IncrementCounter()(ptr {{.*}} @RWSB1) +// CHECK: ret + +// CHECK: define {{.*}} noundef i32 @hlsl::RWStructuredBuffer::IncrementCounter()(ptr {{.*}} %this) +// CHECK: %__counter_handle = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %{{.*}}, i32 0, i32 1 +// DXIL-NEXT: %[[COUNTER_HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 0), ptr %__counter_handle +// DXIL-NEXT: %[[COUNTER:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %[[COUNTER_HANDLE]], i8 1) +// CHECK-NEXT: ret i32 %[[COUNTER]] + export int TestDecrementCounter() { return RWSB2.DecrementCounter(); } +// CHECK: define {{.*}} i32 @TestDecrementCounter()() +// CHECK: call noundef i32 @hlsl::RWStructuredBuffer::DecrementCounter()(ptr {{.*}} @RWSB2) +// CHECK: ret -// CHECK: define noundef i32 @_Z20TestDecrementCounterv() -// CHECK-DXIL: %[[INDEX:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i8 -1) -// CHECK-DXIL: ret i32 %[[INDEX]] +// CHECK: define {{.*}} noundef i32 @hlsl::RWStructuredBuffer::DecrementCounter()(ptr {{.*}} %this) +// CHECK: %__counter_handle = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer.0", ptr %{{.*}}, i32 0, i32 1 +// DXIL-NEXT: %[[COUNTER_HANDLE:.*]] = load target("dx.RawBuffer", <4 x i32>, 1, 0), ptr %__counter_handle +// DXIL-NEXT: %[[COUNTER:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_v4i32_1_0t(target("dx.RawBuffer", <4 x i32>, 1, 0) %[[COUNTER_HANDLE]], i8 -1) +// CHECK-NEXT: ret i32 %[[COUNTER]] export void TestAppend(float value) { ASB.Append(value); } -// CHECK: define void @_Z10TestAppendf(float noundef nofpclass(nan inf) %value) -// CHECK-DXIL: %[[VALUE:.*]] = load float, ptr %value.addr, align 4 -// CHECK-DXIL: %[[INDEX:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i8 1) -// CHECK-DXIL: %[[RESPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i32 %[[INDEX]]) -// CHECK-DXIL: store float %[[VALUE]], ptr %[[RESPTR]], align 4 +// CHECK: define void @TestAppend(float)(float {{.*}} %value) +// CHECK: call void @hlsl::AppendStructuredBuffer::Append(float)(ptr {{.*}} @ASB, float noundef nofpclass(nan inf) %0) +// CHECK: ret void + +// CHECK: define {{.*}} void @hlsl::AppendStructuredBuffer::Append(float)(ptr {{.*}} %this, float noundef nofpclass(nan inf) %value) +// CHECK: %[[VALUE:.*]] = load float, ptr %value.addr +// CHECK-NEXT: %__handle = getelementptr inbounds nuw %"class.hlsl::AppendStructuredBuffer", ptr %{{.*}}, i32 0, i32 0 +// DXIL-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 0), ptr %__handle +// CHECK-NEXT: %__counter_handle = 
getelementptr inbounds nuw %"class.hlsl::AppendStructuredBuffer", ptr %{{.*}}, i32 0, i32 1 +// DXIL-NEXT: %[[COUNTER_HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 0), ptr %__counter_handle +// DXIL-NEXT: %[[COUNTER:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %[[COUNTER_HANDLE]], i8 1) +// DXIL-NEXT: %[[PTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %[[HANDLE]], i32 %[[COUNTER]]) +// CHECK-NEXT: store float %[[VALUE]], ptr %[[PTR]] +// CHECK-NEXT: ret void -export float TestConsume() { +export double TestConsume() { return CSB.Consume(); } - -// CHECK: define noundef nofpclass(nan inf) float @_Z11TestConsumev() -// CHECK-DXIL: %[[INDEX:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %1, i8 -1) -// CHECK-DXIL: %[[RESPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %0, i32 %[[INDEX]]) -// CHECK-DXIL: %[[VALUE:.*]] = load float, ptr %[[RESPTR]], align 4 -// CHECK-DXIL: ret float %[[VALUE]] +// CHECK: define {{.*}} double @TestConsume()() +// CHECK: call {{.*}} double @hlsl::ConsumeStructuredBuffer::Consume()(ptr {{.*}} @CSB) +// CHECK: ret double + +// CHECK: define {{.*}} double @hlsl::ConsumeStructuredBuffer::Consume()(ptr {{.*}} %this) +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::ConsumeStructuredBuffer", ptr %{{.*}}, i32 0, i32 0 +// DXIL-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", double, 1, 0), ptr %__handle +// CHECK-NEXT: %__counter_handle = getelementptr inbounds nuw %"class.hlsl::ConsumeStructuredBuffer", ptr %{{.*}}, i32 0, i32 1 +// DXIL-NEXT: %[[COUNTER_HANDLE:.*]] = load target("dx.RawBuffer", double, 1, 0), ptr %__counter_handle +// DXIL-NEXT: %[[COUNTER:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f64_1_0t(target("dx.RawBuffer", double, 1, 0) %[[COUNTER_HANDLE]], i8 -1) +// DXIL-NEXT: %[[PTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f64_1_0t(target("dx.RawBuffer", double, 1, 0) %[[HANDLE]], i32 %[[COUNTER]]) +// CHECK-NEXT: %[[VAL:.*]] = load double, ptr %[[PTR]], align 8 +// CHECK-NEXT: ret double %[[VAL]] export float TestLoad() { return RWSB1.Load(1) + SB1.Load(2); } -// CHECK: define noundef nofpclass(nan inf) float @_Z8TestLoadv() -// CHECK: %[[PTR1:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i32 %{{[0-9]+}}) -// CHECK: %[[VALUE1:.*]] = load float, ptr %[[PTR1]] -// CHECK: %[[PTR2:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_0_0t(target("dx.RawBuffer", float, 0, 0) %{{[0-9]+}}, i32 %{{[0-9]+}}) -// CHECK: %[[VALUE2:.*]] = load float, ptr %[[PTR2]] +// CHECK: define noundef nofpclass(nan inf) float @TestLoad()() +// CHECK: call {{.*}} float @hlsl::RWStructuredBuffer::Load(unsigned int)(ptr {{.*}} @RWSB1, i32 noundef 1) +// CHECK: call {{.*}} float @hlsl::StructuredBuffer::Load(unsigned int)(ptr {{.*}} @SB1, i32 noundef 2) +// CHECK: add +// CHECK: ret float + +// CHECK: define {{.*}} float @hlsl::RWStructuredBuffer::Load(unsigned int)(ptr {{.*}} %this, i32 noundef %Index) +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %{{.*}}, i32 0, i32 0 +// DXIL-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 0), ptr %__handle +// CHECK-NEXT: %[[INDEX:.*]] = load i32, ptr %Index.addr +// DXIL-NEXT: %[[PTR:.*]] = call ptr 
@llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %[[HANDLE]], i32 %[[INDEX]]) +// CHECK-NEXT: %[[VAL:.*]] = load float, ptr %[[PTR]] +// CHECK-NEXT: ret float %[[VAL]] + +// CHECK: define {{.*}} float @hlsl::StructuredBuffer::Load(unsigned int)(ptr {{.*}} %this, i32 noundef %Index) +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::StructuredBuffer", ptr %{{.*}}, i32 0, i32 0 +// DXIL-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", float, 0, 0), ptr %__handle +// CHECK-NEXT: %[[INDEX:.*]] = load i32, ptr %Index.addr +// DXIL-NEXT: %[[PTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_0_0t(target("dx.RawBuffer", float, 0, 0) %[[HANDLE]], i32 %[[INDEX]]) +// CHECK-NEXT: %[[VAL:.*]] = load float, ptr %[[PTR]] +// CHECK-NEXT: ret float %[[VAL]] + +export uint TestGetDimensions() { + uint dim1, dim2, dim3, stride1, stride2, stride3; + SB1.GetDimensions(dim1, stride1); + RWSB2.GetDimensions(dim2, stride2); + CSB.GetDimensions(dim3, stride3); + return dim1 + dim2 + dim3 + stride1 + stride2 + stride3; +} +// CHECK: define noundef i32 @TestGetDimensions()() +// CHECK: call void @hlsl::StructuredBuffer::GetDimensions(unsigned int&, unsigned int&)(ptr {{.*}} @SB1, ptr {{.*}}, ptr {{.*}}) +// CHECK: call void @hlsl::RWStructuredBuffer::GetDimensions(unsigned int&, unsigned int&)(ptr {{.*}} @RWSB2, ptr {{.*}}, ptr {{.*}}) +// CHECK: call void @hlsl::ConsumeStructuredBuffer::GetDimensions(unsigned int&, unsigned int&)(ptr {{.*}} @CSB, ptr {{.*}}, ptr {{.*}}) +// CHECK: add +// CHECK: ret + +// CHECK: define {{.*}} void @hlsl::StructuredBuffer::GetDimensions(unsigned int&, unsigned int&)(ptr {{.*}}, ptr {{.*}} %numStructs, ptr {{.*}} %stride) +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::StructuredBuffer", ptr %{{.*}}, i32 0, i32 0 +// CHECK-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", float, 0, 0), ptr %__handle +// CHECK-NEXT: %[[NUMSTRUCTS_PTR:.*]] = load ptr, ptr %numStructs.addr +// DXIL-NEXT: %[[NUMSTRUCTS:.*]] = call i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_f32_0_0t(target("dx.RawBuffer", float, 0, 0) %[[HANDLE]]) +// CHECK-NEXT: store i32 %[[NUMSTRUCTS]], ptr %[[NUMSTRUCTS_PTR]] +// CHECK-NEXT: %[[STRIDEPTR:.*]] = load ptr, ptr %stride.addr +// CHECK-NEXT: store i32 4, ptr %[[STRIDEPTR]] +// CHECK-NEXT: ret void + +// CHECK: define {{.*}} void @hlsl::RWStructuredBuffer::GetDimensions(unsigned int&, unsigned int&)(ptr {{.*}} %this, {{.*}} %numStructs, {{.*}} %stride) +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer.0", ptr %{{.*}}, i32 0, i32 0 +// CHECK-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", <4 x i32>, 1, 0), ptr %__handle +// CHECK-NEXT: %[[NUMSTRUCTS_PTR:.*]] = load ptr, ptr %numStructs.addr +// DXIL-NEXT: %[[NUMSTRUCTS:.*]] = call i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_v4i32_1_0t(target("dx.RawBuffer", <4 x i32>, 1, 0) %[[HANDLE]]) +// CHECK-NEXT: store i32 %[[NUMSTRUCTS]], ptr %[[NUMSTRUCTS_PTR]] +// CHECK-NEXT: %[[STRIDEPTR:.*]] = load ptr, ptr %stride.addr +// CHECK-NEXT: store i32 16, ptr %[[STRIDEPTR]] +// CHECK-NEXT: ret void + +// CHECK: define {{.*}} void @hlsl::ConsumeStructuredBuffer::GetDimensions(unsigned int&, unsigned int&)(ptr {{.*}} %this, {{.*}} %numStructs, {{.*}} %stride) +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::ConsumeStructuredBuffer", ptr %{{.*}}, i32 0, i32 0 +// CHECK-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", double, 1, 0), ptr %__handle +// CHECK-NEXT: 
+// CHECK-NEXT: %[[NUMSTRUCTS_PTR:.*]] = load ptr, ptr %numStructs.addr
+// DXIL-NEXT: %[[NUMSTRUCTS:.*]] = call i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_f64_1_0t(target("dx.RawBuffer", double, 1, 0) %[[HANDLE]])
+// CHECK-NEXT: store i32 %[[NUMSTRUCTS]], ptr %[[NUMSTRUCTS_PTR]]
+// CHECK-NEXT: %[[STRIDEPTR:.*]] = load ptr, ptr %stride.addr
+// CHECK-NEXT: store i32 8, ptr %[[STRIDEPTR]]
+// CHECK-NEXT: ret void
+
+// DXIL: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0), i8)
+// DXIL: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_v4i32_1_0t(target("dx.RawBuffer", <4 x i32>, 1, 0), i8)
+// DXIL: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0), i32)
+// DXIL: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f64_1_0t(target("dx.RawBuffer", double, 1, 0), i8)
+// DXIL: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f64_1_0t(target("dx.RawBuffer", double, 1, 0), i32)
+// DXIL: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_0_0t(target("dx.RawBuffer", float, 0, 0), i32)
-// CHECK: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0), i8)
-// CHECK: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0), i32)
-// CHECK: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_0_0t(target("dx.RawBuffer", float, 0, 0), i32)
+// DXIL: declare i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_f32_0_0t(target("dx.RawBuffer", float, 0, 0))
+// DXIL: declare i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_v4i32_1_0t(target("dx.RawBuffer", <4 x i32>, 1, 0))
+// DXIL: declare i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_f64_1_0t(target("dx.RawBuffer", double, 1, 0))
diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl
index 9e08a6d0d7ae0..25fa75965d686 100644
--- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl
+++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-methods-ps.hlsl
@@ -1,37 +1,106 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-pixel -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
-// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-pixel -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-pixel -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,DXIL
+// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-pixel -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,SPV
 
 // NOTE: SPIRV codegen for resource methods is not yet implemented
 
-RWStructuredBuffer<float> RWSB1, RWSB2;
-RasterizerOrderedStructuredBuffer<float> ROSB1, ROSB2;
+RasterizerOrderedStructuredBuffer<float> ROSB1;
+RasterizerOrderedStructuredBuffer<int2> ROSB2;
 
-// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) }
+// %"class.hlsl::RasterizerOrderedStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 1), target("dx.RawBuffer", float, 1, 1) }
+// %"class.hlsl::RasterizerOrderedStructuredBuffer.0" = type { target("dx.RawBuffer", <2 x i32>, 1, 1), target("dx.RawBuffer", <2 x i32>, 1, 1) }
+
%"class.hlsl::RasterizerOrderedStructuredBuffer" poison +// CHECK: @ROSB2 = internal global %"class.hlsl::RasterizerOrderedStructuredBuffer.0" poison export void TestIncrementCounter() { -// CHECK: define void @_Z20TestIncrementCounterv() -// CHECK-DXIL: call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i8 1) -// CHECK-DXIL: call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1) %{{[0-9]+}}, i8 1) - RWSB1.IncrementCounter(); ROSB1.IncrementCounter(); } +// CHECK: define void @TestIncrementCounter()() +// CHECK: call noundef i32 @hlsl::RasterizerOrderedStructuredBuffer::IncrementCounter()(ptr {{.*}} @ROSB1) +// CHECK-NEXT: ret void + +// CHECK: define {{.*}} i32 @hlsl::RasterizerOrderedStructuredBuffer::IncrementCounter()(ptr {{.*}} %this) +// CHECK: %__counter_handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedStructuredBuffer", ptr %{{.*}}, i32 0, i32 1 +// CHECK-NEXT: %[[COUNTER_HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 1), ptr %__counter_handle +// DXIL-NEXT: %[[VAL:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1) %[[COUNTER_HANDLE]], i8 1) +// CHECK-NEXT: ret i32 %[[VAL]] + export void TestDecrementCounter() { -// CHECK: define void @_Z20TestDecrementCounterv() -// CHECK-DXIL: call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0) %{{[0-9]+}}, i8 -1) -// CHECK-DXIL: call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1) %{{[0-9]+}}, i8 -1) - RWSB2.DecrementCounter(); ROSB2.DecrementCounter(); } +// CHECK: define void @TestDecrementCounter()() +// CHECK: call noundef i32 @hlsl::RasterizerOrderedStructuredBuffer::DecrementCounter()(ptr {{.*}} @ROSB2) +// CHECK-NEXT: ret void + +// CHECK: define {{.*}} i32 @hlsl::RasterizerOrderedStructuredBuffer::DecrementCounter()(ptr {{.*}} %this) +// CHECK: %__counter_handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedStructuredBuffer.0", ptr %{{.*}}, i32 0, i32 1 +// CHECK-NEXT: %[[COUNTER_HANDLE:.*]] = load target("dx.RawBuffer", <2 x i32>, 1, 1), ptr %__counter_handle +// DXIL-NEXT: %[[VAL:.*]] = call i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_v2i32_1_1t(target("dx.RawBuffer", <2 x i32>, 1, 1) %[[COUNTER_HANDLE]], i8 -1) +// CHECK-NEXT: ret i32 %[[VAL]] + export float TestLoad() { - return ROSB1.Load(10); + return ROSB1.Load(10).x + ROSB2.Load(20).x; +} + +// CHECK: define {{.*}} float @TestLoad()() +// CHECK: call {{.*}} float @hlsl::RasterizerOrderedStructuredBuffer::Load(unsigned int)(ptr {{.*}} @ROSB1, i32 noundef 10) +// CHECK: call {{.*}} <2 x i32> @hlsl::RasterizerOrderedStructuredBuffer::Load(unsigned int)(ptr {{.*}} @ROSB2, i32 noundef 20) +// CHECK: ret + +// CHECK: define {{.*}} float @hlsl::RasterizerOrderedStructuredBuffer::Load(unsigned int)(ptr {{.*}} %Index) +// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedStructuredBuffer", ptr {{.*}}, i32 0, i32 0 +// CHECK-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 1), ptr %__handle +// CHECK-NEXT: %[[INDEX:.*]] = load i32, ptr %Index.addr +// DXIL-NEXT: %[[BUFPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1) %[[HANDLE]], i32 %[[INDEX]]) +// CHECK-NEXT: %[[VAL:.*]] = load float, ptr %[[BUFPTR]] +// CHECK-NEXT: ret float %[[VAL]] + +// CHECK: define {{.*}} <2 x i32> 
+// CHECK: define {{.*}} <2 x i32> @hlsl::RasterizerOrderedStructuredBuffer<int vector[2]>::Load(unsigned int)(ptr {{.*}} %Index)
+// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedStructuredBuffer.0", ptr {{.*}}, i32 0, i32 0
+// CHECK-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", <2 x i32>, 1, 1), ptr %__handle
+// CHECK-NEXT: %[[INDEX:.*]] = load i32, ptr %Index.addr
+// DXIL-NEXT: %[[BUFPTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_v2i32_1_1t(target("dx.RawBuffer", <2 x i32>, 1, 1) %[[HANDLE]], i32 %[[INDEX]])
+// CHECK-NEXT: %[[VAL:.*]] = load <2 x i32>, ptr %[[BUFPTR]]
+// CHECK-NEXT: ret <2 x i32> %[[VAL]]
+
+export uint TestGetDimensions() {
+  uint dim1, dim2, stride1, stride2;
+  ROSB1.GetDimensions(dim1, stride1);
+  ROSB2.GetDimensions(dim2, stride2);
+  return dim1 + dim2 + stride1 + stride2;
 }
+// CHECK: define noundef i32 @TestGetDimensions()()
+// CHECK: call void @hlsl::RasterizerOrderedStructuredBuffer<float>::GetDimensions(unsigned int&, unsigned int&)(ptr {{.*}} @ROSB1, ptr {{.*}}, ptr {{.*}})
+// CHECK: call void @hlsl::RasterizerOrderedStructuredBuffer<int vector[2]>::GetDimensions(unsigned int&, unsigned int&)(ptr {{.*}} @ROSB2, ptr {{.*}}, ptr {{.*}})
+// CHECK: add
+// CHECK: ret
+
+// CHECK: define {{.*}} void @hlsl::RasterizerOrderedStructuredBuffer<float>::GetDimensions(unsigned int&, unsigned int&)(ptr {{.*}}, ptr {{.*}} %numStructs, ptr {{.*}} %stride)
+// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedStructuredBuffer", ptr %{{.*}}, i32 0, i32 0
+// DXIL-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", float, 1, 1), ptr %__handle
+// CHECK-NEXT: %[[NUMSTRUCTS_PTR:.*]] = load ptr, ptr %numStructs.addr
+// DXIL-NEXT: %[[NUMSTRUCTS:.*]] = call i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1) %[[HANDLE]])
+// CHECK-NEXT: store i32 %[[NUMSTRUCTS]], ptr %[[NUMSTRUCTS_PTR]]
+// CHECK-NEXT: %[[STRIDEPTR:.*]] = load ptr, ptr %stride.addr
+// CHECK-NEXT: store i32 4, ptr %[[STRIDEPTR]]
+// CHECK-NEXT: ret void
+
+// CHECK: define {{.*}} void @hlsl::RasterizerOrderedStructuredBuffer<int vector[2]>::GetDimensions(unsigned int&, unsigned int&)(ptr {{.*}}, ptr {{.*}} %numStructs, ptr {{.*}} %stride)
+// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedStructuredBuffer.0", ptr %{{.*}}, i32 0, i32 0
+// DXIL-NEXT: %[[HANDLE:.*]] = load target("dx.RawBuffer", <2 x i32>, 1, 1), ptr %__handle
+// CHECK-NEXT: %[[NUMSTRUCTS_PTR:.*]] = load ptr, ptr %numStructs.addr
+// DXIL-NEXT: %[[NUMSTRUCTS:.*]] = call i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_v2i32_1_1t(target("dx.RawBuffer", <2 x i32>, 1, 1) %[[HANDLE]])
+// CHECK-NEXT: store i32 %[[NUMSTRUCTS]], ptr %[[NUMSTRUCTS_PTR]]
+// CHECK-NEXT: %[[STRIDEPTR:.*]] = load ptr, ptr %stride.addr
+// CHECK-NEXT: store i32 8, ptr %[[STRIDEPTR]]
+// CHECK-NEXT: ret void
 
-// CHECK: define noundef nofpclass(nan inf) float @_Z8TestLoadv()
-// CHECK: %[[PTR1:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1) %{{[0-9]+}}, i32 %{{[0-9]+}})
-// CHECK: %[[VALUE1:.*]] = load float, ptr %[[PTR1]]
+// DXIL: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1), i8)
+// DXIL: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_v2i32_1_1t(target("dx.RawBuffer", <2 x i32>, 1, 1), i8)
+// DXIL: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1), i32)
+// DXIL: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_v2i32_1_1t(target("dx.RawBuffer", <2 x i32>, 1, 1), i32)
 
-// CHECK: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_0t(target("dx.RawBuffer", float, 1, 0), i8)
-// CHECK: declare i32 @llvm.dx.resource.updatecounter.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1), i8)
-// CHECK: declare ptr @llvm.dx.resource.getpointer.p0.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1), i32)
+// DXIL: declare i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_f32_1_1t(target("dx.RawBuffer", float, 1, 1))
+// DXIL: declare i32 @llvm.dx.resource.getdimensions.x.tdx.RawBuffer_v2i32_1_1t(target("dx.RawBuffer", <2 x i32>, 1, 1))
diff --git a/clang/test/CodeGenHLSL/resources/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/resources/TypedBuffers-constructor.hlsl
similarity index 77%
rename from clang/test/CodeGenHLSL/resources/RWBuffer-constructor.hlsl
rename to clang/test/CodeGenHLSL/resources/TypedBuffers-constructor.hlsl
index ca33c4220dd73..1ec9f0f54441e 100644
--- a/clang/test/CodeGenHLSL/resources/RWBuffer-constructor.hlsl
+++ b/clang/test/CodeGenHLSL/resources/TypedBuffers-constructor.hlsl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | \
-// RUN: llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
+// RUN: llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
 // FIXME: SPIR-V codegen of llvm.spv.resource.handlefrombinding and resource types is not yet implemented
 // RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | \
 // llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
@@ -14,7 +14,7 @@ RWBuffer<float> Buf1 : register(u5, space3);
 
 // Resource with implicit binding
-RWBuffer<double> Buf2;
+Buffer<double> Buf2;
 
 export void foo() {
   // Local resource declaration
@@ -22,12 +22,12 @@ export void foo() {
 }
 
 // CHECK: %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", float, 1, 0, 0) }
-// CHECK: %"class.hlsl::RWBuffer.0" = type { target("dx.TypedBuffer", double, 1, 0, 0) }
-// CHECK: %"class.hlsl::RWBuffer.1" = type { target("dx.TypedBuffer", i32, 1, 0, 1) }
+// CHECK: %"class.hlsl::Buffer" = type { target("dx.TypedBuffer", double, 0, 0, 0) }
+// CHECK: %"class.hlsl::RWBuffer.0" = type { target("dx.TypedBuffer", i32, 1, 0, 1) }
 
 // CHECK: @Buf1 = internal global %"class.hlsl::RWBuffer" poison, align 4
 // CHECK: @[[Buf1Str:.*]] = private unnamed_addr constant [5 x i8] c"Buf1\00", align 1
-// CHECK: @Buf2 = internal global %"class.hlsl::RWBuffer.0" poison, align 4
+// CHECK: @Buf2 = internal global %"class.hlsl::Buffer" poison, align 4
 // CHECK: @[[Buf2Str:.*]] = private unnamed_addr constant [5 x i8] c"Buf2\00", align 1
 
 // Buf1 initialization part 1 - global init function that calls RWBuffer::__createFromBinding
@@ -50,24 +50,24 @@ export void foo() {
 // Buf2 initialization part 1 - global init function that RWBuffer::__createFromImplicitBinding
 // CHECK: define internal void @__cxx_global_var_init.1()
 // CHECK-NEXT: entry:
-// CHECK-NEXT: call void @hlsl::RWBuffer<double>::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*)
+// CHECK-NEXT: call void @hlsl::Buffer<double>::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*)
 // CHECK-SAME: (ptr {{.*}} @Buf2, i32 noundef 0, i32 noundef 0, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf2Str]])
 
-// Buf2 initialization part 2 - body of RWBuffer::__createFromImplicitBinding call
-// CHECK: define linkonce_odr hidden void @hlsl::RWBuffer<double>::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*)
-// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.0") align 4 %[[RetValue2:.*]], i32 noundef %orderId,
+// Buf2 initialization part 2 - body of Buffer::__createFromImplicitBinding call
+// CHECK: define linkonce_odr hidden void @hlsl::Buffer<double>::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*)
+// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::Buffer") align 4 %[[RetValue2:.*]], i32 noundef %orderId,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
-// CHECK: %[[Tmp2:.*]] = alloca %"class.hlsl::RWBuffer.0", align 4
-// CHECK: %[[Handle2:.*]] = call target("dx.TypedBuffer", double, 1, 0, 0)
-// CHECK-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.TypedBuffer_f64_1_0_0t(
-// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer.0", ptr %[[Tmp2]], i32 0, i32 0
-// CHECK-DXIL: store target("dx.TypedBuffer", double, 1, 0, 0) %[[Handle2]], ptr %__handle, align 4
-// CHECK: call void @hlsl::RWBuffer<double>::RWBuffer(hlsl::RWBuffer<double> const&)(ptr {{.*}} %[[RetValue2]], ptr {{.*}} %[[Tmp2]])
+// CHECK: %[[Tmp2:.*]] = alloca %"class.hlsl::Buffer", align 4
+// CHECK: %[[Handle2:.*]] = call target("dx.TypedBuffer", double, 0, 0, 0)
+// CHECK-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.TypedBuffer_f64_0_0_0t(
+// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::Buffer", ptr %[[Tmp2]], i32 0, i32 0
+// CHECK-DXIL: store target("dx.TypedBuffer", double, 0, 0, 0) %[[Handle2]], ptr %__handle, align 4
+// CHECK: call void @hlsl::Buffer<double>::Buffer(hlsl::Buffer<double> const&)(ptr {{.*}} %[[RetValue2]], ptr {{.*}} %[[Tmp2]])
 
 // Buf3 initialization part 1 - local variable declared in function foo() is initialized by RWBuffer C1 default constructor
 // CHECK: define void @foo()
 // CHECK-NEXT: entry:
-// CHECK-NEXT: %Buf3 = alloca %"class.hlsl::RWBuffer.1", align 4
+// CHECK-NEXT: %Buf3 = alloca %"class.hlsl::RWBuffer.0", align 4
 // CHECK-NEXT: call void @hlsl::RWBuffer<int>::RWBuffer()(ptr {{.*}} %Buf3)
 
 // Buf3 initialization part 2 - body of RWBuffer default C1 constructor that calls the default C2 constructor
@@ -76,11 +76,11 @@ export void foo() {
 
 // Buf3 initialization part 3 - body of RWBuffer default C2 constructor that initializes handle to poison
 // CHECK: define linkonce_odr hidden void @hlsl::RWBuffer<int>::RWBuffer()(ptr {{.*}} %this)
-// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer.1", ptr %{{.*}}, i32 0, i32 0
+// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer.0", ptr %{{.*}}, i32 0, i32 0
 // CHECK-NEXT: store target("dx.TypedBuffer", i32, 1, 0, 1) poison, ptr %__handle, align 4
 
 // Module initialization
-// CHECK: define internal void @_GLOBAL__sub_I_RWBuffer_constructor.hlsl()
+// CHECK: define internal void @_GLOBAL__sub_I_TypedBuffers_constructor.hlsl()
 // CHECK-NEXT: entry:
 // CHECK-NEXT: call void @__cxx_global_var_init()
 // CHECK-NEXT: call void @__cxx_global_var_init.1()
diff --git a/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl b/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl
new file mode 100644
index 0000000000000..d3dba8a69cc72
--- /dev/null
+++ b/clang/test/CodeGenHLSL/resources/TypedBuffers-elementtype.hlsl
@@ -0,0 +1,94 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
+// RUN:   -emit-llvm -o - -DRESOURCE=Buffer %s | FileCheck %s -DRESOURCE=Buffer -DRW=0 -check-prefixes=DXIL
+
+// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type \
+// RUN:   -emit-llvm -o - -DRESOURCE=Buffer %s | FileCheck %s -DRESOURCE=Buffer -DRW=1 -check-prefixes=SPV-RO
+
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type \
+// RUN:   -emit-llvm -o - -DRESOURCE=RWBuffer %s | FileCheck %s -DRESOURCE=RWBuffer -DRW=1 -check-prefixes=DXIL
+
+// RUN: %clang_cc1 -triple spirv-pc-vulkan-compute -finclude-default-header -fnative-half-type \
+// RUN:   -emit-llvm -o - -DRESOURCE=RWBuffer %s | FileCheck %s -DRESOURCE=RWBuffer -DRW=2 -check-prefixes=SPV-RW
+
+// DXIL: %"class.hlsl::[[RESOURCE]]" = type { target("dx.TypedBuffer", i16, [[RW]], 0, 1) }
+// DXIL: %"class.hlsl::[[RESOURCE]].0" = type { target("dx.TypedBuffer", i16, [[RW]], 0, 0) }
+// DXIL: %"class.hlsl::[[RESOURCE]].1" = type { target("dx.TypedBuffer", i32, [[RW]], 0, 1) }
+// DXIL: %"class.hlsl::[[RESOURCE]].2" = type { target("dx.TypedBuffer", i32, [[RW]], 0, 0) }
+// DXIL: %"class.hlsl::[[RESOURCE]].3" = type { target("dx.TypedBuffer", i64, [[RW]], 0, 1) }
+// DXIL: %"class.hlsl::[[RESOURCE]].4" = type { target("dx.TypedBuffer", i64, [[RW]], 0, 0) }
+// DXIL: %"class.hlsl::[[RESOURCE]].5" = type { target("dx.TypedBuffer", half, [[RW]], 0, 0) }
+// DXIL: %"class.hlsl::[[RESOURCE]].6" = type { target("dx.TypedBuffer", float, [[RW]], 0, 0) }
+// DXIL: %"class.hlsl::[[RESOURCE]].7" = type { target("dx.TypedBuffer", double, [[RW]], 0, 0) }
+// DXIL: %"class.hlsl::[[RESOURCE]].8" = type { target("dx.TypedBuffer", <4 x i16>, [[RW]], 0, 1) }
+// DXIL: %"class.hlsl::[[RESOURCE]].9" = type { target("dx.TypedBuffer", <3 x i32>, [[RW]], 0, 0) }
+// DXIL: %"class.hlsl::[[RESOURCE]].10" = type { target("dx.TypedBuffer", <2 x half>, [[RW]], 0, 0) }
+// DXIL: %"class.hlsl::[[RESOURCE]].11" = type { target("dx.TypedBuffer", <3 x float>, [[RW]], 0, 0) }
+// DXIL: %"class.hlsl::[[RESOURCE]].12" = type { target("dx.TypedBuffer", <4 x i32>, [[RW]], 0, 1) }
+
+// SPV-RO: %"class.hlsl::[[RESOURCE]]" = type { target("spirv.SignedImage", i16, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].0" = type { target("spirv.Image", i16, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].1" = type { target("spirv.SignedImage", i32, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].2" = type { target("spirv.Image", i32, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].3" = type { target("spirv.SignedImage", i64, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].4" = type { target("spirv.Image", i64, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].5" = type { target("spirv.Image", half, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].6" = type { target("spirv.Image", float, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].7" = type { target("spirv.Image", double, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].8" = type { target("spirv.SignedImage", i16, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].9" = type { target("spirv.Image", i32, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].10" = type { target("spirv.Image", half, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].11" = type { target("spirv.Image", float, 5, 2, 0, 0, 1, 0) }
+// SPV-RO: %"class.hlsl::[[RESOURCE]].12" = type { target("spirv.SignedImage", i32, 5, 2, 0, 0, 1, 0) }
+
target("spirv.SignedImage", i16, 5, 2, 0, 0, 2, 0) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].0" = type { target("spirv.Image", i16, 5, 2, 0, 0, 2, 0) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].1" = type { target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 24) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].2" = type { target("spirv.Image", i32, 5, 2, 0, 0, 2, 33) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].3" = type { target("spirv.SignedImage", i64, 5, 2, 0, 0, 2, 41) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].4" = type { target("spirv.Image", i64, 5, 2, 0, 0, 2, 40) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].5" = type { target("spirv.Image", half, 5, 2, 0, 0, 2, 0) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].6" = type { target("spirv.Image", float, 5, 2, 0, 0, 2, 3) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].7" = type { target("spirv.Image", double, 5, 2, 0, 0, 2, 0) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].8" = type { target("spirv.SignedImage", i16, 5, 2, 0, 0, 2, 0) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].9" = type { target("spirv.Image", i32, 5, 2, 0, 0, 2, 0) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].10" = type { target("spirv.Image", half, 5, 2, 0, 0, 2, 0) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].11" = type { target("spirv.Image", float, 5, 2, 0, 0, 2, 0) } +// SPV-RW: %"class.hlsl::[[RESOURCE]].12" = type { target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 21) } + +RESOURCE BufI16; +RESOURCE BufU16; +RESOURCE BufI32; +RESOURCE BufU32; +RESOURCE BufI64; +RESOURCE BufU64; +RESOURCE BufF16; +RESOURCE BufF32; +RESOURCE BufF64; +RESOURCE< vector > BufI16x4; +RESOURCE< vector > BufU32x3; +RESOURCE BufF16x2; +RESOURCE BufF32x3; +RESOURCE BufI32x4; +// TODO: RESOURCE BufSNormF16; -> 11 +// TODO: RESOURCE BufUNormF16; -> 12 +// TODO: RESOURCE BufSNormF32; -> 13 +// TODO: RESOURCE BufUNormF32; -> 14 +// TODO: RESOURCE BufSNormF64; -> 15 +// TODO: RESOURCE BufUNormF64; -> 16 + +[numthreads(1,1,1)] +void main(int GI : SV_GroupIndex) { + int16_t v1 = BufI16[GI]; + uint16_t v2 = BufU16[GI]; + int v3 = BufI32[GI]; + uint v4 = BufU32[GI]; + int64_t v5 = BufI64[GI]; + uint64_t v6 = BufU64[GI]; + half v7 = BufF16[GI]; + float v8 = BufF32[GI]; + double v9 = BufF64[GI]; + vector v10 = BufI16x4[GI]; + vector v11 = BufU32x3[GI]; + half2 v12 = BufF16x2[GI]; + float3 v13 = BufF32x3[GI]; +} diff --git a/clang/test/CodeGenHLSL/resources/TypedBuffers-methods.hlsl b/clang/test/CodeGenHLSL/resources/TypedBuffers-methods.hlsl new file mode 100644 index 0000000000000..fdc1ef08b7c2c --- /dev/null +++ b/clang/test/CodeGenHLSL/resources/TypedBuffers-methods.hlsl @@ -0,0 +1,74 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,DXIL +// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -finclude-default-header -emit-llvm -disable-llvm-passes -o - %s | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,SPIRV + +// NOTE: SPIRV codegen for resource methods is not yet implemented + +Buffer Buf : register(t0); +RWBuffer RWBuf : register(u0); + +// DXIL: %"class.hlsl::Buffer" = type { target("dx.TypedBuffer", float, 0, 0, 0) } +// DXIL: %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", <4 x i32>, 1, 0, 0) } + +// DXIL: @Buf = internal global %"class.hlsl::Buffer" poison +// DXIL: @RWBuf = internal global %"class.hlsl::RWBuffer" poison + +export float TestLoad() { + return Buf.Load(1) + RWBuf.Load(2).y; +} + +// CHECK: define noundef nofpclass(nan inf) float @TestLoad()() +// CHECK: call {{.*}} float 
+// CHECK: call {{.*}} float @hlsl::Buffer<float>::Load(unsigned int)(ptr {{.*}} @Buf, i32 noundef 1)
+// CHECK: call {{.*}} <4 x i32> @hlsl::RWBuffer<unsigned int vector[4]>::Load(unsigned int)(ptr {{.*}} @RWBuf, i32 noundef 2)
+// CHECK: add
+// CHECK: ret float
+
+// CHECK: define {{.*}} float @hlsl::Buffer<float>::Load(unsigned int)(ptr {{.*}} %this, i32 noundef %Index)
+// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::Buffer", ptr %{{.*}}, i32 0, i32 0
+// DXIL-NEXT: %[[HANDLE:.*]] = load target("dx.TypedBuffer", float, 0, 0, 0), ptr %__handle
+// CHECK-NEXT: %[[INDEX:.*]] = load i32, ptr %Index.addr
+// DXIL-NEXT: %[[PTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_f32_0_0_0t(target("dx.TypedBuffer", float, 0, 0, 0) %[[HANDLE]], i32 %[[INDEX]])
+// CHECK-NEXT: %[[VAL:.*]] = load float, ptr %[[PTR]]
+// CHECK-NEXT: ret float %[[VAL]]
+
+// CHECK: define {{.*}} <4 x i32> @hlsl::RWBuffer<unsigned int vector[4]>::Load(unsigned int)(ptr {{.*}} %this, i32 noundef %Index)
+// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %{{.*}}, i32 0, i32 0
+// DXIL-NEXT: %[[HANDLE:.*]] = load target("dx.TypedBuffer", <4 x i32>, 1, 0, 0), ptr %__handle
+// CHECK-NEXT: %[[INDEX:.*]] = load i32, ptr %Index.addr
+// DXIL-NEXT: %[[PTR:.*]] = call ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_v4i32_1_0_0t(target("dx.TypedBuffer", <4 x i32>, 1, 0, 0) %[[HANDLE]], i32 %[[INDEX]])
+// CHECK-NEXT: %[[VEC:.*]] = load <4 x i32>, ptr %[[PTR]]
+// CHECK-NEXT: ret <4 x i32> %[[VEC]]
+
+export uint TestGetDimensions() {
+  uint dim1, dim2;
+  Buf.GetDimensions(dim1);
+  RWBuf.GetDimensions(dim2);
+  return dim1 + dim2;
+}
+
+// CHECK: @TestGetDimensions()()
+// CHECK: call void @hlsl::Buffer<float>::GetDimensions(unsigned int&)(ptr {{.*}} @Buf, ptr {{.*}})
+// CHECK: call void @hlsl::RWBuffer<unsigned int vector[4]>::GetDimensions(unsigned int&)(ptr {{.*}} @RWBuf, ptr {{.*}})
+// CHECK: add
+// CHECK: ret
+
+// CHECK: define {{.*}} void @hlsl::Buffer<float>::GetDimensions(unsigned int&)(ptr {{.*}} %this, ptr noalias {{.*}} %dim)
+// CHECK: %[[HANDLE_PTR:.*]] = getelementptr inbounds nuw %"class.hlsl::Buffer", ptr %this1, i32 0, i32 0
+// CHECK-NEXT: %[[HANDLE:.*]] = load target("dx.TypedBuffer", float, 0, 0, 0), ptr %[[HANDLE_PTR]]
+// CHECK-NEXT: %[[DIMPTR:.*]] = load ptr, ptr %dim.addr
+// DXIL-NEXT: %[[DIM:.*]] = call i32 @llvm.dx.resource.getdimensions.x.tdx.TypedBuffer_f32_0_0_0t(target("dx.TypedBuffer", float, 0, 0, 0) %[[HANDLE]])
+// CHECK-NEXT: store i32 %[[DIM]], ptr %[[DIMPTR]]
+// CHECK-NEXT: ret void
+
+// CHECK: define {{.*}} void @hlsl::RWBuffer<unsigned int vector[4]>::GetDimensions(unsigned int&)(ptr {{.*}} %this, {{.*}} %dim)
+// CHECK: %[[HANDLE_PTR:.*]] = getelementptr inbounds nuw %"class.hlsl::RWBuffer", ptr %{{.*}}, i32 0, i32 0
+// CHECK-NEXT: %[[HANDLE:.*]] = load target("dx.TypedBuffer", <4 x i32>, 1, 0, 0), ptr %[[HANDLE_PTR]]
+// CHECK-NEXT: %[[DIMPTR:.*]] = load ptr, ptr %dim.addr
+// DXIL-NEXT: %[[DIM:.*]] = call i32 @llvm.dx.resource.getdimensions.x.tdx.TypedBuffer_v4i32_1_0_0t(target("dx.TypedBuffer", <4 x i32>, 1, 0, 0) %[[HANDLE]])
+// CHECK-NEXT: store i32 %[[DIM]], ptr %[[DIMPTR]]
+// CHECK-NEXT: ret void
+
+// DXIL: declare ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_f32_0_0_0t(target("dx.TypedBuffer", float, 0, 0, 0), i32)
+// DXIL: declare ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_v4i32_1_0_0t(target("dx.TypedBuffer", <4 x i32>, 1, 0, 0), i32)
+
+// DXIL: declare i32 @llvm.dx.resource.getdimensions.x.tdx.TypedBuffer_f32_0_0_0t(target("dx.TypedBuffer", float, 0, 0, 0))
+// DXIL: declare i32 @llvm.dx.resource.getdimensions.x.tdx.TypedBuffer_v4i32_1_0_0t(target("dx.TypedBuffer", <4 x i32>, 1, 0, 0))
diff --git a/clang/test/CodeGenHLSL/resources/TypedBuffers-subscript.hlsl b/clang/test/CodeGenHLSL/resources/TypedBuffers-subscript.hlsl
new file mode 100644
index 0000000000000..adc35f6097403
--- /dev/null
+++ b/clang/test/CodeGenHLSL/resources/TypedBuffers-subscript.hlsl
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=DXIL,CHECK
+// RUN: %clang_cc1 -triple spirv1.6-pc-vulkan1.3-compute -fspv-use-unknown-image-format -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=SPIRV,CHECK
+
+Buffer<int> In;
+RWBuffer<int> Out;
+
+[numthreads(1,1,1)]
+void main(unsigned GI : SV_GroupIndex) {
+  // CHECK: define void @main()
+
+  // DXIL: %[[INPTR:.*]] = call {{.*}} ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_0_0_1t(target("dx.TypedBuffer", i32, 0, 0, 1) %{{.*}}, i32 %{{.*}})
+  // SPIRV: %[[INPTR:.*]] = call {{.*}} ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_1_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 1, 0) %{{.*}}, i32 %{{.*}})
+  // CHECK: %[[LOAD:.*]] = load i32, ptr {{.*}}%[[INPTR]]
+  // DXIL: %[[OUTPTR:.*]] = call {{.*}} ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // SPIRV: %[[OUTPTR:.*]] = call {{.*}} ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}})
+  // CHECK: store i32 %[[LOAD]], ptr {{.*}}%[[OUTPTR]]
+  Out[GI] = In[GI];
+
+  // DXIL: %[[INPTR:.*]] = call {{.*}} ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // SPIRV: %[[INPTR:.*]] = call {{.*}} ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}})
+  // CHECK: %[[LOAD:.*]] = load i32, ptr {{.*}}%[[INPTR]]
+  // DXIL: %[[OUTPTR:.*]] = call {{.*}} ptr @llvm.dx.resource.getpointer.p0.tdx.TypedBuffer_i32_1_0_1t(target("dx.TypedBuffer", i32, 1, 0, 1) %{{.*}}, i32 %{{.*}})
+  // SPIRV: %[[OUTPTR:.*]] = call {{.*}} ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %{{.*}}, i32 %{{.*}})
+  // CHECK: store i32 %[[LOAD]], ptr {{.*}}%[[OUTPTR]]
+  Out[GI + 1] = Out[GI];
+}
diff --git a/clang/test/CodeGenHLSL/vk-features/maximal_reconvergence.hlsl b/clang/test/CodeGenHLSL/vk-features/maximal_reconvergence.hlsl
new file mode 100644
index 0000000000000..f23ac7c5434ab
--- /dev/null
+++ b/clang/test/CodeGenHLSL/vk-features/maximal_reconvergence.hlsl
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple spirv1.6-unknown-vulkan1.3-compute -fspv-enable-maximal-reconvergence -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -triple spirv1.6-unknown-vulkan1.3-compute -hlsl-entry test -fspv-enable-maximal-reconvergence -emit-llvm -o - -O0 %s | FileCheck %s --check-prefixes=CHECK-ENTRY
+
+[numthreads(1,1,1)]
+void main() {
+// CHECK: define void @main() [[attributeNumber:#[0-9]+]] {
+}
+
+// CHECK: attributes [[attributeNumber]] = {{.*}} "enable-maximal-reconvergence"="true" {{.*}}
+
+
+[numthreads(1,1,1)]
+void test() {
+// CHECK-ENTRY: define void @test() [[attributeNumber:#[0-9]+]] {
+}
+
+// CHECK-ENTRY: attributes [[attributeNumber]] = {{.*}}
"enable-maximal-reconvergence"="true" {{.*}} diff --git a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl index b55f663d6d948..05b7a9b40b02e 100644 --- a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl +++ b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl @@ -27,7 +27,7 @@ __amdgpu_buffer_rsrc_t getBuffer(void *p) { // CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq ptr addrspace(5) [[P]], addrspacecast (ptr null to ptr addrspace(5)) // CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[IF_END:.*]], label %[[IF_THEN:.*]] // CHECK: [[IF_THEN]]: -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[P]], align 16, !tbaa [[__AMDGPU_BUFFER_RSRC_T_TBAA4:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[P]], align 16, !tbaa [[__AMDGPU_BUFFER_RSRC_T_TBAA8:![0-9]+]] // CHECK-NEXT: tail call void @consumeBuffer(ptr addrspace(8) [[TMP0]]) #[[ATTR2]] // CHECK-NEXT: br label %[[IF_END]] // CHECK: [[IF_END]]: @@ -41,14 +41,14 @@ void consumeBufferPtr(__amdgpu_buffer_rsrc_t *p) { // CHECK-LABEL: define dso_local void @test( // CHECK-SAME: ptr addrspace(5) noundef readonly captures(address) [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A]], align 16, !tbaa [[INT_TBAA8:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A]], align 16, !tbaa [[INT_TBAA10:![0-9]+]] // CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 // CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq ptr addrspace(5) [[A]], addrspacecast (ptr null to ptr addrspace(5)) // CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TOBOOL_NOT]], i1 true, i1 [[TOBOOL_NOT_I]] // CHECK-NEXT: br i1 [[OR_COND]], label %[[IF_END:.*]], label %[[IF_THEN_I:.*]] // CHECK: [[IF_THEN_I]]: // CHECK-NEXT: [[R:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[A]], i32 16 -// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[R]], align 16, !tbaa [[__AMDGPU_BUFFER_RSRC_T_TBAA4]] +// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[R]], align 16, !tbaa [[__AMDGPU_BUFFER_RSRC_T_TBAA8]] // CHECK-NEXT: tail call void @consumeBuffer(ptr addrspace(8) [[TMP1]]) #[[ATTR2]] // CHECK-NEXT: br label %[[IF_END]] // CHECK: [[IF_END]]: @@ -81,11 +81,11 @@ AA bar(void *p) { return a; } //. -// CHECK: [[__AMDGPU_BUFFER_RSRC_T_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// CHECK: [[META5]] = !{!"__amdgpu_buffer_rsrc_t", [[META6:![0-9]+]], i64 0} +// CHECK: [[META5:![0-9]+]] = !{!"int", [[META6:![0-9]+]], i64 0} // CHECK: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} // CHECK: [[META7]] = !{!"Simple C/C++ TBAA"} -// CHECK: [[INT_TBAA8]] = !{[[META9:![0-9]+]], [[META10:![0-9]+]], i64 0} -// CHECK: [[META9]] = !{!"AA_ty", [[META10]], i64 0, [[META5]], i64 16} -// CHECK: [[META10]] = !{!"int", [[META6]], i64 0} +// CHECK: [[__AMDGPU_BUFFER_RSRC_T_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +// CHECK: [[META9]] = !{!"__amdgpu_buffer_rsrc_t", [[META6]], i64 0} +// CHECK: [[INT_TBAA10]] = !{[[META11:![0-9]+]], [[META5]], i64 0} +// CHECK: [[META11]] = !{!"AA_ty", [[META5]], i64 0, [[META9]], i64 16} //. 
diff --git a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl new file mode 100644 index 0000000000000..8c3e5b70ea308 --- /dev/null +++ b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl @@ -0,0 +1,47 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals all --include-generated-funcs --prefix-filecheck-ir-name VAR --version 6 +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -disable-llvm-passes -fno-ident -emit-llvm %s -o - | FileCheck %s + +kernel void foo(global int *p) { *p = 1; } +// CHECK: Function Attrs: convergent norecurse nounwind +// CHECK-LABEL: define dso_local amdgpu_kernel void @foo( +// CHECK-SAME: ptr addrspace(1) noundef align 4 [[P:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10:![0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[P]], ptr [[P_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA11:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[P_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA11]] +// CHECK-NEXT: call void @__clang_ocl_kern_imp_foo(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: alwaysinline convergent norecurse nounwind +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_foo( +// CHECK-SAME: ptr addrspace(1) noundef align 4 [[P:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[P]], ptr [[P_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA11]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[P_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA11]] +// CHECK-NEXT: store i32 1, ptr addrspace(1) [[TMP0]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] +// CHECK-NEXT: ret void +// +//. 
+// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" } +// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" } +// CHECK: attributes #[[ATTR2]] = { convergent nounwind } +//. +// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} +// CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK: [[META2:![0-9]+]] = !{i32 2, i32 0} +// CHECK: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META7]] = !{i32 1} +// CHECK: [[META8]] = !{!"none"} +// CHECK: [[META9]] = !{!"int*"} +// CHECK: [[META10]] = !{!""} +// CHECK: [[INTPTR_TBAA11]] = !{[[META12:![0-9]+]], [[META12]], i64 0} +// CHECK: [[META12]] = !{!"p1 int", [[META13:![0-9]+]], i64 0} +// CHECK: [[META13]] = !{!"any pointer", [[META5]], i64 0} +//. 
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index 6d573238440d2..e9adac23a6530 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -451,19 +451,19 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[ID_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ID_ADDR]] to ptr // GFX900-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// GFX900-NEXT: store i64 [[ID]], ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3:![0-9]+]] -// GFX900-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7:![0-9]+]] -// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] -// GFX900-NEXT: [[TMP2:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: store i64 [[ID]], ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA7:![0-9]+]] +// GFX900-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA9:![0-9]+]] +// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA7]] +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA9]] +// GFX900-NEXT: [[TMP2:%.*]] = load i64, ptr [[ID_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA7]] // GFX900-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP1]], i64 [[TMP2]] -// GFX900-NEXT: store i64 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: store i64 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 8, !tbaa [[LONG_TBAA7]] // GFX900-NEXT: ret void // // // GFX900: Function Attrs: convergent norecurse nounwind // GFX900-LABEL: define dso_local amdgpu_kernel void @test( -// GFX900-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { +// GFX900-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META12:![0-9]+]] !kernel_arg_access_qual [[META13:![0-9]+]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META15:![0-9]+]] { // GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) @@ -473,21 +473,21 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr // GFX900-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr // GFX900-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr -// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA14:![0-9]+]] -// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA16:![0-9]+]] 
-// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] -// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] -// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA14]] -// GFX900-NEXT: [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA16]] -// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] -// GFX900-NEXT: [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA16:![0-9]+]] +// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA18:![0-9]+]] +// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA9]] +// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA7]] +// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA16]] +// GFX900-NEXT: [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA18]] +// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA9]] +// GFX900-NEXT: [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA7]] // GFX900-NEXT: call void @__clang_ocl_kern_imp_test(ptr addrspace(1) noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR8:[0-9]+]] // GFX900-NEXT: ret void // // // GFX900: Function Attrs: alwaysinline convergent norecurse nounwind // GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test( -// GFX900-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// GFX900-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META13]] !kernel_arg_type [[META14]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META15]] { // GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) @@ -519,16 +519,16 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_SIZES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK_SIZES]] to ptr // GFX900-NEXT: [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr // GFX900-NEXT: [[TMP27_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP27]] to ptr -// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA14]] -// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA16]] -// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] -// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA16]] +// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 
1, !tbaa [[CHAR_TBAA18]] +// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA9]] +// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA7]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9:[0-9]+]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] -// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17:![0-9]+]] +// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] // GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19:![0-9]+]] -// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17]] +// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA3]] // GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21:![0-9]+]] // GFX900-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 25, ptr [[BLOCK_SIZE]], align 8 @@ -537,14 +537,14 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_INVOKE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 2 // GFX900-NEXT: store ptr @__test_block_invoke, ptr [[BLOCK_INVOKE]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 3 -// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA14]] -// GFX900-NEXT: store ptr addrspace(1) [[TMP2]], ptr [[BLOCK_CAPTURED]], align 8, !tbaa [[CHARPTR_TBAA14]] +// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA16]] +// GFX900-NEXT: store ptr addrspace(1) [[TMP2]], ptr [[BLOCK_CAPTURED]], align 8, !tbaa [[CHARPTR_TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 4 -// GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA16]] -// GFX900-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA18]] +// GFX900-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[CHAR_TBAA18]] // GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]]) // GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19]] -// GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17]] +// GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA3]] // GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP2_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] // GFX900-NEXT: 
[[BLOCK_SIZE4:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 41, ptr [[BLOCK_SIZE4]], align 8 @@ -553,20 +553,20 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 2 // GFX900-NEXT: store ptr @__test_block_invoke_2, ptr [[BLOCK_INVOKE6]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 3 -// GFX900-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA14]] -// GFX900-NEXT: store ptr addrspace(1) [[TMP7]], ptr [[BLOCK_CAPTURED7]], align 8, !tbaa [[CHARPTR_TBAA14]] +// GFX900-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA16]] +// GFX900-NEXT: store ptr addrspace(1) [[TMP7]], ptr [[BLOCK_CAPTURED7]], align 8, !tbaa [[CHARPTR_TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 6 -// GFX900-NEXT: [[TMP8:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA16]] -// GFX900-NEXT: store i8 [[TMP8]], ptr [[BLOCK_CAPTURED8]], align 8, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: [[TMP8:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA18]] +// GFX900-NEXT: store i8 [[TMP8]], ptr [[BLOCK_CAPTURED8]], align 8, !tbaa [[CHAR_TBAA18]] // GFX900-NEXT: [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 4 -// GFX900-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] -// GFX900-NEXT: store ptr addrspace(1) [[TMP9]], ptr [[BLOCK_CAPTURED9]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA9]] +// GFX900-NEXT: store ptr addrspace(1) [[TMP9]], ptr [[BLOCK_CAPTURED9]], align 8, !tbaa [[LONGPTR_TBAA9]] // GFX900-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 5 -// GFX900-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] -// GFX900-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA7]] +// GFX900-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[LONG_TBAA7]] // GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]]) // GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19]] -// GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17]] +// GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA3]] // GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP11_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 
4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] // GFX900-NEXT: [[BLOCK_SIZE13:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 41, ptr [[BLOCK_SIZE13]], align 8 @@ -575,17 +575,17 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 2 // GFX900-NEXT: store ptr @__test_block_invoke_3, ptr [[BLOCK_INVOKE15]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 3 -// GFX900-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA14]] -// GFX900-NEXT: store ptr addrspace(1) [[TMP14]], ptr [[BLOCK_CAPTURED16]], align 8, !tbaa [[CHARPTR_TBAA14]] +// GFX900-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[CHARPTR_TBAA16]] +// GFX900-NEXT: store ptr addrspace(1) [[TMP14]], ptr [[BLOCK_CAPTURED16]], align 8, !tbaa [[CHARPTR_TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 6 -// GFX900-NEXT: [[TMP15:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA16]] -// GFX900-NEXT: store i8 [[TMP15]], ptr [[BLOCK_CAPTURED17]], align 8, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: [[TMP15:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[CHAR_TBAA18]] +// GFX900-NEXT: store i8 [[TMP15]], ptr [[BLOCK_CAPTURED17]], align 8, !tbaa [[CHAR_TBAA18]] // GFX900-NEXT: [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 4 -// GFX900-NEXT: [[TMP16:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] -// GFX900-NEXT: store ptr addrspace(1) [[TMP16]], ptr [[BLOCK_CAPTURED18]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: [[TMP16:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA9]] +// GFX900-NEXT: store ptr addrspace(1) [[TMP16]], ptr [[BLOCK_CAPTURED18]], align 8, !tbaa [[LONGPTR_TBAA9]] // GFX900-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5 -// GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] -// GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA7]] +// GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[LONG_TBAA7]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]] // GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0 // GFX900-NEXT: store i64 100, ptr addrspace(5) [[TMP18]], align 8 @@ -599,16 +599,16 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 2 // GFX900-NEXT: store ptr @__test_block_invoke_4, ptr [[BLOCK_INVOKE24]], align 8 // GFX900-NEXT: 
[[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 3 -// GFX900-NEXT: [[TMP20:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA3]] -// GFX900-NEXT: store i64 [[TMP20]], ptr [[BLOCK_CAPTURED25]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: [[TMP20:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[LONG_TBAA7]] +// GFX900-NEXT: store i64 [[TMP20]], ptr [[BLOCK_CAPTURED25]], align 8, !tbaa [[LONG_TBAA7]] // GFX900-NEXT: [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 4 -// GFX900-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA7]] -// GFX900-NEXT: store ptr addrspace(1) [[TMP21]], ptr [[BLOCK_CAPTURED26]], align 8, !tbaa [[LONGPTR_TBAA7]] -// GFX900-NEXT: store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[LONGPTR_TBAA9]] +// GFX900-NEXT: store ptr addrspace(1) [[TMP21]], ptr [[BLOCK_CAPTURED26]], align 8, !tbaa [[LONGPTR_TBAA9]] +// GFX900-NEXT: store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[CHAR_TBAA18]] // GFX900-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19]] -// GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17]] +// GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA3]] // GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP27_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] -// GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[CHAR_TBAA18]] // GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]]) // GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]] // GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] @@ -643,11 +643,11 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[INTPTR_TBAA26]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] -// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17]] +// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA3]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] // GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() // GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[QUEUE_T_TBAA19]] -// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA17]] +// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[INT_TBAA3]] // GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], 
ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] // GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) // GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] @@ -664,11 +664,11 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 -// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[CHAR_TBAA18]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[CHARPTR_TBAA14]] +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[CHARPTR_TBAA16]] // GFX900-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0 -// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA18]] // GFX900-NEXT: ret void // // @@ -691,17 +691,17 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 -// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[CHAR_TBAA18]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[CHARPTR_TBAA14]] +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[CHARPTR_TBAA16]] // GFX900-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0 -// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA18]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5 -// GFX900-NEXT: [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: 
[[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8, !tbaa [[LONG_TBAA7]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 -// GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[LONGPTR_TBAA9]] // GFX900-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0 -// GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[LONG_TBAA7]] // GFX900-NEXT: ret void // // @@ -727,20 +727,20 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 // GFX900-NEXT: store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[ANYPTR_TBAA32:![0-9]+]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 -// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[CHAR_TBAA18]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[CHARPTR_TBAA14]] +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[CHARPTR_TBAA16]] // GFX900-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0 -// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA16]] +// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[CHAR_TBAA18]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5 -// GFX900-NEXT: [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: [[TMP2:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR2]], align 8, !tbaa [[LONG_TBAA7]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 -// GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[LONGPTR_TBAA9]] // GFX900-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0 -// GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[LONG_TBAA7]] // GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[ANYPTR_TBAA32]] // GFX900-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0 -// GFX900-NEXT: store i32 1, ptr addrspace(3) 
[[ARRAYIDX5]], align 4, !tbaa [[INT_TBAA17]] +// GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[INT_TBAA3]] // GFX900-NEXT: ret void // // @@ -763,9 +763,9 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 -// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[LONG_TBAA3]] +// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[LONG_TBAA7]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[LONGPTR_TBAA7]] +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[LONGPTR_TBAA9]] // GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8]] // GFX900-NEXT: ret void // @@ -852,36 +852,36 @@ kernel void test_target_features_kernel(global int *i) { // GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} // GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // GFX900: [[META2:![0-9]+]] = !{i32 2, i32 0} -// GFX900: [[LONG_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} -// GFX900: [[META4]] = !{!"long", [[META5:![0-9]+]], i64 0} +// GFX900: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// GFX900: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} // GFX900: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} // GFX900: [[META6]] = !{!"Simple C/C++ TBAA"} -// GFX900: [[LONGPTR_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} -// GFX900: [[META8]] = !{!"p1 long", [[META9:![0-9]+]], i64 0} -// GFX900: [[META9]] = !{!"any pointer", [[META5]], i64 0} -// GFX900: [[META10]] = !{i32 1, i32 0, i32 1, i32 0} -// GFX900: [[META11]] = !{!"none", !"none", !"none", !"none"} -// GFX900: [[META12]] = !{!"char*", !"char", !"long*", !"long"} -// GFX900: [[META13]] = !{!"", !"", !"", !""} -// GFX900: [[CHARPTR_TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0} -// GFX900: [[META15]] = !{!"p1 omnipotent char", [[META9]], i64 0} -// GFX900: [[CHAR_TBAA16]] = !{[[META5]], [[META5]], i64 0} -// GFX900: [[INT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} -// GFX900: [[META18]] = !{!"int", [[META5]], i64 0} +// GFX900: [[LONG_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +// GFX900: [[META8]] = !{!"long", [[META5]], i64 0} +// GFX900: [[LONGPTR_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// GFX900: [[META10]] = !{!"p1 long", [[META11:![0-9]+]], i64 0} +// GFX900: [[META11]] = !{!"any pointer", [[META5]], i64 0} +// GFX900: [[META12]] = !{i32 1, i32 0, i32 1, i32 0} +// GFX900: [[META13]] = !{!"none", !"none", !"none", !"none"} +// GFX900: [[META14]] = !{!"char*", !"char", !"long*", !"long"} +// GFX900: [[META15]] = !{!"", !"", !"", !""} +// GFX900: [[CHARPTR_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} +// GFX900: [[META17]] = !{!"p1 omnipotent char", [[META11]], i64 0} +// GFX900: [[CHAR_TBAA18]] = !{[[META5]], [[META5]], i64 0} // GFX900: [[QUEUE_T_TBAA19]] = !{[[META20:![0-9]+]], 
[[META20]], i64 0} // GFX900: [[META20]] = !{!"queue_t", [[META5]], i64 0} -// GFX900: [[TBAA_STRUCT21]] = !{i64 0, i64 4, [[INT_TBAA17]]} +// GFX900: [[TBAA_STRUCT21]] = !{i64 0, i64 4, [[INT_TBAA3]]} // GFX900: [[META22]] = !{i32 1} // GFX900: [[META23]] = !{!"none"} // GFX900: [[META24]] = !{!"int*"} // GFX900: [[META25]] = !{!""} // GFX900: [[INTPTR_TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -// GFX900: [[META27]] = !{!"p1 int", [[META9]], i64 0} +// GFX900: [[META27]] = !{!"p1 int", [[META11]], i64 0} // GFX900: [[META28]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle} // GFX900: [[META29]] = !{i32 0} // GFX900: [[META30]] = !{!"__block_literal"} // GFX900: [[META31]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle} -// GFX900: [[ANYPTR_TBAA32]] = !{[[META9]], [[META9]], i64 0} +// GFX900: [[ANYPTR_TBAA32]] = !{[[META11]], [[META11]], i64 0} // GFX900: [[META33]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle} // GFX900: [[META34]] = !{i32 0, i32 3} // GFX900: [[META35]] = !{!"none", !"none"} diff --git a/clang/test/CodeGenOpenCL/amdgpu-printf.cl b/clang/test/CodeGenOpenCL/amdgpu-printf.cl index cea7ee576d822..829f672a6ccc9 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-printf.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-printf.cl @@ -16,71 +16,71 @@ __kernel void test_printf_str_int(int i) { printf("%s:%d", s, i); } // CHECK-LABEL: define dso_local amdgpu_kernel void @test_printf_noargs( -// CHECK-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META4]] { +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: call void @__clang_ocl_kern_imp_test_printf_noargs() #[[ATTR5:[0-9]+]] // CHECK-NEXT: ret void // // // CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_test_printf_noargs( -// CHECK-SAME: ) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META4]] { +// CHECK-SAME: ) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) 
@printf(ptr addrspace(4) noundef @.str) #[[ATTR5]] // CHECK-NEXT: ret void // // // CHECK-LABEL: define dso_local amdgpu_kernel void @test_printf_int( -// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META5:![0-9]+]] !kernel_arg_access_qual [[META6:![0-9]+]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8:![0-9]+]] { +// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META10:![0-9]+]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: call void @__clang_ocl_kern_imp_test_printf_int(i32 noundef [[TMP0]]) #[[ATTR5]] // CHECK-NEXT: ret void // // // CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_test_printf_int( -// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META5]] !kernel_arg_access_qual [[META6]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8]] { +// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA4]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) 
@printf(ptr addrspace(4) noundef @.str.1, i32 noundef [[TMP0]]) #[[ATTR5]] // CHECK-NEXT: ret void // // // CHECK-LABEL: define dso_local amdgpu_kernel void @test_printf_str_int( -// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META5]] !kernel_arg_access_qual [[META6]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8]] { +// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA4]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: call void @__clang_ocl_kern_imp_test_printf_str_int(i32 noundef [[TMP0]]) #[[ATTR5]] // CHECK-NEXT: ret void // // // CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_test_printf_str_int( -// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META5]] !kernel_arg_access_qual [[META6]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8]] { +// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5) -// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[S]]) #[[ATTR6:[0-9]+]] // CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false) // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 0 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA9]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[INT_TBAA4]] // CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP0]]) #[[ATTR5]] // CHECK-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[S]]) #[[ATTR6]] // CHECK-NEXT: ret void // //. 
-// CHECK: [[META4]] = !{} -// CHECK: [[META5]] = !{i32 0} -// CHECK: [[META6]] = !{!"none"} -// CHECK: [[META7]] = !{!"int"} -// CHECK: [[META8]] = !{!""} -// CHECK: [[INT_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} -// CHECK: [[META10]] = !{!"int", [[META11:![0-9]+]], i64 0} -// CHECK: [[META11]] = !{!"omnipotent char", [[META12:![0-9]+]], i64 0} -// CHECK: [[META12]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// CHECK: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META8]] = !{} +// CHECK: [[META9]] = !{i32 0} +// CHECK: [[META10]] = !{!"none"} +// CHECK: [[META11]] = !{!"int"} +// CHECK: [[META12]] = !{!""} //. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl index 321835cc3d28d..6326866ed3c35 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl @@ -18,7 +18,7 @@ typedef int v8i __attribute__((ext_vector_type(8))); // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8:![0-9]+]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) @@ -34,7 +34,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) @@ -50,7 +50,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A]], <8 x half> [[B]], <8 x half> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], 
align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) @@ -66,7 +66,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) @@ -82,7 +82,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) @@ -98,7 +98,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <8 x i32> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) @@ -110,7 +110,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void 
test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) @@ -122,7 +122,7 @@ void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8 // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) @@ -134,7 +134,7 @@ void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8 // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) @@ -146,7 +146,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8 // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <8 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) @@ -158,7 +158,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8 // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, 
v8i c) @@ -166,7 +166,7 @@ void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); } //. -// CHECK-GFX1200: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// CHECK-GFX1200: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// CHECK-GFX1200: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1200: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK-GFX1200: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1200: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} //. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl index 8b5b31537ce58..a79c3d4da1ebb 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl @@ -17,7 +17,7 @@ typedef int v4i __attribute__((ext_vector_type(4))); // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) @@ -33,7 +33,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) @@ -49,7 +49,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A]], <4 x half> [[B]], <4 x half> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) @@ -65,7 +65,7 @@ void 
test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) @@ -81,7 +81,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) @@ -97,7 +97,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) @@ -109,7 +109,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c) @@ -121,7 +121,7 @@ void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4 // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) 
initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c) @@ -133,7 +133,7 @@ void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4 // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c) @@ -145,7 +145,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4 // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A]], i32 [[B]], <4 x float> [[C]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c) @@ -157,7 +157,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4 // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i1 false) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c) @@ -165,7 +165,7 @@ void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c) *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); } //. 
-// CHECK-GFX1200: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// CHECK-GFX1200: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// CHECK-GFX1200: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1200: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK-GFX1200: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1200: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} //. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl index e03ae66f92035..22004627b561f 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-async-load-store-lds.cl @@ -152,7 +152,7 @@ void test_amdgcn_ds_atomic_async_barrier_arrive_b64(local long* addr) // CHECK-GFX1250-SAME: ptr addrspace(3) noundef captures(none) [[ADDR:%.*]], i64 noundef [[DATA:%.*]], ptr noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { // CHECK-GFX1250-NEXT: [[ENTRY:.*:]] // CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64(ptr addrspace(3) [[ADDR]], i64 [[DATA]]) -// CHECK-GFX1250-NEXT: store i64 [[TMP0]], ptr [[OUT]], align 8, !tbaa [[LONG_TBAA4:![0-9]+]] +// CHECK-GFX1250-NEXT: store i64 [[TMP0]], ptr [[OUT]], align 8, !tbaa [[LONG_TBAA8:![0-9]+]] // CHECK-GFX1250-NEXT: ret void // void test_amdgcn_ds_atomic_barrier_arrive_rtn_b64(local long* addr, long data, long *out) @@ -160,8 +160,8 @@ void test_amdgcn_ds_atomic_barrier_arrive_rtn_b64(local long* addr, long data, l *out = __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64(addr, data); } //. -// CHECK-GFX1250: [[LONG_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// CHECK-GFX1250: [[META5]] = !{!"long", [[META6:![0-9]+]], i64 0} -// CHECK-GFX1250: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK-GFX1250: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} // CHECK-GFX1250: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1250: [[LONG_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} +// CHECK-GFX1250: [[META9]] = !{!"long", [[META6]], i64 0} //. 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl new file mode 100644 index 0000000000000..e15ca0167ef6c --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl @@ -0,0 +1,99 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals smart +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s -check-prefixes=GFX,GFX950 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx9-4-generic -emit-llvm -o - %s | FileCheck %s -check-prefixes=GFX,GFX9_4_GENERIC + +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; +typedef v4u32 __global *global_ptr_to_v4u32; + +//------------------------------------------------------------------------------ +// Store +//------------------------------------------------------------------------------ +// GFX-LABEL: @test_amdgcn_global_store_b128_00( +// GFX-NEXT: entry: +// GFX-NEXT: tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META4:![0-9]+]]) +// GFX-NEXT: ret void +// +void test_amdgcn_global_store_b128_00(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_global_store_b128(ptr, data, "wavefront"); +} + +// GFX-LABEL: @test_amdgcn_global_store_b128_01( +// GFX-NEXT: entry: +// GFX-NEXT: tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META5:![0-9]+]]) +// GFX-NEXT: ret void +// +void test_amdgcn_global_store_b128_01(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_global_store_b128(ptr, data, "workgroup"); +} + +// GFX-LABEL: @test_amdgcn_global_store_b128_10( +// GFX-NEXT: entry: +// GFX-NEXT: tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META6:![0-9]+]]) +// GFX-NEXT: ret void +// +void test_amdgcn_global_store_b128_10(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_global_store_b128(ptr, data, "agent"); +} + +// GFX-LABEL: @test_amdgcn_global_store_b128_11( +// GFX-NEXT: entry: +// GFX-NEXT: tail call void @llvm.amdgcn.global.store.b128(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7:![0-9]+]]) +// GFX-NEXT: ret void +// +void test_amdgcn_global_store_b128_11(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_global_store_b128(ptr, data, ""); +} + +//------------------------------------------------------------------------------ +// Load +//------------------------------------------------------------------------------ +// GFX-LABEL: @test_amdgcn_global_load_b128_00( +// GFX-NEXT: entry: +// GFX-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META4]]) +// GFX-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_00(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_global_load_b128(ptr, "wavefront"); +} + +// GFX-LABEL: @test_amdgcn_global_load_b128_01( +// GFX-NEXT: entry: +// GFX-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META5]]) +// GFX-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_01(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_global_load_b128(ptr, "workgroup"); +} + +// GFX-LABEL: @test_amdgcn_global_load_b128_10( +// GFX-NEXT: entry: +// 
GFX-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META6]]) +// GFX-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_10(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_global_load_b128(ptr, "agent"); +} + +// GFX-LABEL: @test_amdgcn_global_load_b128_11( +// GFX-NEXT: entry: +// GFX-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.global.load.b128(ptr addrspace(1) [[PTR:%.*]], metadata [[META7]]) +// GFX-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_11(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_global_load_b128(ptr, ""); +} +//. +// GFX950: [[META4]] = !{!"wavefront"} +// GFX950: [[META5]] = !{!"workgroup"} +// GFX950: [[META6]] = !{!"agent"} +// GFX950: [[META7]] = !{!""} +//. +// GFX9_4_GENERIC: [[META4]] = !{!"wavefront"} +// GFX9_4_GENERIC: [[META5]] = !{!"workgroup"} +// GFX9_4_GENERIC: [[META6]] = !{!"agent"} +// GFX9_4_GENERIC: [[META7]] = !{!""} +//. +//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +// GFX950: {{.*}} +// GFX9_4_GENERIC: {{.*}} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl index b3367202f824e..a02c97b115b5a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl @@ -13,7 +13,7 @@ typedef unsigned int uint; // CHECK-LABEL: define dso_local amdgpu_kernel void @test_builtins_amdgcn_gws_insts -// CHECK-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 { +// CHECK-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !8 !kernel_arg_access_qual !9 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !11 { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.amdgcn.ds.gws.init(i32 [[A]], i32 [[B]]) // CHECK-NEXT: tail call void @llvm.amdgcn.ds.gws.barrier(i32 [[A]], i32 [[B]]) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl index 214390142b6aa..72ba1915fa01c 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl @@ -17,7 +17,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i32(<8 x half> [[A]], <16 x half> [[B]], <8 x float> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8:![0-9]+]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, int index) @@ -29,7 +29,7 @@ void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f // CHECK-GFX1200-SAME: ptr addrspace(1) 
noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i32(<8 x i16> [[A]], <16 x i16> [[B]], <8 x float> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, int index) @@ -41,7 +41,7 @@ void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8 // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i32(<8 x half> [[A]], <16 x half> [[B]], <8 x half> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, int index) @@ -53,7 +53,7 @@ void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <8 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i32(<8 x i16> [[A]], <16 x i16> [[B]], <8 x i16> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, int index) @@ -65,7 +65,7 @@ void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i32(i1 true, <2 x i32> [[A]], i1 true, <4 x i32> [[B]], <8 x i32> [[C]], i32 [[INDEX]], i1 true) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, 
v2i a, v4i b, v8i c, int index) @@ -77,7 +77,7 @@ void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i32(i1 true, i32 [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i32 [[INDEX]], i1 true) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, int index) @@ -89,7 +89,7 @@ void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i32(i1 true, <2 x i32> [[A]], i1 true, <4 x i32> [[B]], <8 x i32> [[C]], i32 [[INDEX]], i1 true) -// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, int index) @@ -101,7 +101,7 @@ void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i32(<2 x i32> [[A]], <4 x i32> [[B]], <8 x float> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, int index) @@ -113,7 +113,7 @@ void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i32(<2 x i32> [[A]], <4 x i32> [[B]], <8 x float> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: 
store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, int index) @@ -125,7 +125,7 @@ void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i32(<2 x i32> [[A]], <4 x i32> [[B]], <8 x float> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, int index) @@ -137,7 +137,7 @@ void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i32(<2 x i32> [[A]], <4 x i32> [[B]], <8 x float> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, int index) @@ -145,7 +145,7 @@ void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index); } //. -// CHECK-GFX1200: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// CHECK-GFX1200: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// CHECK-GFX1200: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1200: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK-GFX1200: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1200: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} //. 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl index 47753afd1aa52..015c493c66d48 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl @@ -16,7 +16,7 @@ typedef short v8s __attribute__((ext_vector_type(8))); // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i32(<4 x half> [[A]], <8 x half> [[B]], <4 x float> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, int index) @@ -28,7 +28,7 @@ void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i32(<4 x i16> [[A]], <8 x i16> [[B]], <4 x float> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, int index) @@ -40,7 +40,7 @@ void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i32(<4 x half> [[A]], <8 x half> [[B]], <4 x half> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, int index) @@ -52,7 +52,7 @@ void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 8)) [[OUT:%.*]], <4 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> 
@llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i32(<4 x i16> [[A]], <8 x i16> [[B]], <4 x i16> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, int index) @@ -64,7 +64,7 @@ void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4 // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i32(i1 true, i32 [[A]], i1 true, <2 x i32> [[B]], <4 x i32> [[C]], i32 [[INDEX]], i1 true) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, int index) @@ -76,7 +76,7 @@ void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i32(i1 true, i32 [[A]], i1 true, i32 [[B]], <4 x i32> [[C]], i32 [[INDEX]], i1 true) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, int index) @@ -88,7 +88,7 @@ void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i32(i1 true, i32 [[A]], i1 true, <2 x i32> [[B]], <4 x i32> [[C]], i32 [[INDEX]], i1 true) -// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, int index) @@ -100,7 +100,7 @@ void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) 
local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i32(i32 [[A]], <2 x i32> [[B]], <4 x float> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, int index) @@ -112,7 +112,7 @@ void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i32(i32 [[A]], <2 x i32> [[B]], <4 x float> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, int index) @@ -124,7 +124,7 @@ void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i32(i32 [[A]], <2 x i32> [[B]], <4 x float> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, int index) @@ -136,7 +136,7 @@ void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, // CHECK-GFX1200-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1200-NEXT: [[ENTRY:.*:]] // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i32(i32 [[A]], <2 x i32> [[B]], <4 x float> [[C]], i32 [[INDEX]]) -// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1200-NEXT: ret void // void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, int index) @@ -144,7 +144,7 @@ void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index); } //. 
-// CHECK-GFX1200: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// CHECK-GFX1200: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// CHECK-GFX1200: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1200: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK-GFX1200: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1200: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} //. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl index 6bb20bff436fb..faf6a7d44fee2 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl @@ -5,6 +5,8 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s // RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,AMDGCNSPIRV %s +#define INVALID_MEMORY_SCOPE (__MEMORY_SCOPE_CLUSTR+1) + #pragma OPENCL EXTENSION cl_khr_fp16 : enable typedef unsigned long ulong; @@ -252,13 +254,19 @@ void test_update_dpp_const_int(global int* out, int arg1) // CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} // CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} -// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} +// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} // AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}} -// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} -// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} + +// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} + +// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("cluster") monotonic, align 4{{$}} +// AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} + +// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} // AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}} -// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} -// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}} + +// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} +// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}} #if !defined(__SPIRV__) void test_ds_faddf(local float *out, float src) { #else @@ -279,9 +287,10 @@ void test_ds_faddf(local float *out, float src) { // Test all syncscopes. 
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false); *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false); + *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR, false); *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false); *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false); - *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, 5, false); // invalid + *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, INVALID_MEMORY_SCOPE, false); // invalid } // CHECK-LABEL: @test_ds_fmin @@ -295,13 +304,19 @@ void test_ds_faddf(local float *out, float src) { // CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} // CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} -// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} +// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} // AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}} -// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} -// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} + +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} + +// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("cluster") monotonic, align 4{{$}} +// AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} + +// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} // AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}} -// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} -// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}} + +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}} #if !defined(__SPIRV__) void test_ds_fminf(local float *out, float src) { @@ -322,9 +337,10 @@ void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) { // Test all syncscopes. 
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false); *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false); + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR, false); *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false); *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false); - *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, 5, false); // invalid + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, INVALID_MEMORY_SCOPE, false); // invalid } // CHECK-LABEL: @test_ds_fmax @@ -338,13 +354,19 @@ void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) { // CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} // CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} -// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} +// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} // AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}} -// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} -// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} + +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} + +// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("cluster") monotonic, align 4{{$}} +// AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} + +// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} // AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}} -// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} -// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}} + +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}} #if !defined(__SPIRV__) void test_ds_fmaxf(local float *out, float src) { @@ -365,9 +387,10 @@ void test_ds_fmaxf(__attribute__((address_space(3))) float *out, float src) { // Test all syncscopes. 
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false); *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false); + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR, false); *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false); *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false); - *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, 5, false); // invalid + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, INVALID_MEMORY_SCOPE, false); // invalid } // CHECK-LABEL: @test_s_memtime diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl index 853cd32f8bdce..2f3ab1148912a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl @@ -21,7 +21,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[A]], <16 x half> [[B]], <8 x float> [[C]]) -// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4:![0-9]+]] +// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f c) @@ -37,7 +37,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <16 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[A]], <16 x i16> [[B]], <8 x float> [[C]]) -// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f c) @@ -53,7 +53,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <16 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[A]], <16 x half> [[B]], <16 x half> [[C]], i1 true) -// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* 
out, v16h a, v16h b, v16h c) @@ -69,7 +69,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* out, v16h a, v16h b, v16 // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <16 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <16 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> [[A]], <16 x i16> [[B]], <16 x i16> [[C]], i1 true) -// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v16s c) @@ -85,7 +85,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <16 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[A]], <16 x half> [[B]], <16 x half> [[C]], i1 true) -// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b, v16h c) @@ -101,7 +101,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <16 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <16 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16.v16i16(<16 x i16> [[A]], <16 x i16> [[B]], <16 x i16> [[C]], i1 true) -// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s b, v16s c) @@ -117,7 +117,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 true, <4 x i32> [[A]], i1 true, <4 x i32> [[B]], <8 x i32> [[C]], i1 false) -// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, 
v8i c) @@ -133,7 +133,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, v8i c) // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 32)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <8 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <8 x i32> [[C]], i1 false) -// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 32, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) @@ -143,7 +143,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) #endif //. -// CHECK-GFX1100: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// CHECK-GFX1100: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// CHECK-GFX1100: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1100: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK-GFX1100: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1100: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} //. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl index 9b6872f6b1e6d..024a9287c071b 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl @@ -22,7 +22,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[A]], <16 x half> [[B]], <4 x float> [[C]]) -// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4:![0-9]+]] +// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f c) @@ -38,7 +38,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <16 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> [[A]], <16 x i16> [[B]], <4 x float> [[C]]) -// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f c) @@ -54,7 +54,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f // 
CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[A]], <16 x half> [[B]], <8 x half> [[C]], i1 true) -// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h c) @@ -70,7 +70,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <16 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> [[A]], <16 x i16> [[B]], <8 x i16> [[C]], i1 true) -// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8s c) @@ -86,7 +86,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8 // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <16 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[A]], <16 x half> [[B]], <8 x half> [[C]], i1 true) -// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b, v8h c) @@ -102,7 +102,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b, // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <16 x i16> noundef [[A:%.*]], <16 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> [[A]], <16 x i16> [[B]], <8 x i16> [[C]], i1 true) -// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s b, v8s c) @@ -118,7 +118,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s // CHECK-GFX1100-SAME: ptr addrspace(1) noundef 
writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 true, <4 x i32> [[A]], i1 true, <4 x i32> [[B]], <4 x i32> [[C]], i1 false) -// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c) @@ -134,7 +134,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c) // CHECK-GFX1100-SAME: ptr addrspace(1) noundef writeonly captures(none) initializes((0, 16)) [[OUT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-GFX1100-NEXT: [[ENTRY:.*:]] // CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 true, <2 x i32> [[A]], i1 true, <2 x i32> [[B]], <4 x i32> [[C]], i1 false) -// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA4]] +// CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT]], align 16, !tbaa [[CHAR_TBAA8]] // CHECK-GFX1100-NEXT: ret void // void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, v2i a, v2i b, v4i c) @@ -144,7 +144,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, v2i a, v2i b, v4i c) #endif //. -// CHECK-GFX1100: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// CHECK-GFX1100: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// CHECK-GFX1100: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1100: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK-GFX1100: [[META7]] = !{!"Simple C/C++ TBAA"} +// CHECK-GFX1100: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} //. 
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl index 6e5c1c49504ec..e76aa81f918cb 100644 --- a/clang/test/CodeGenOpenCL/preserve_vec3.cl +++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl @@ -9,11 +9,11 @@ typedef float float3 __attribute__((ext_vector_type(3))); typedef float float4 __attribute__((ext_vector_type(4))); // CHECK-LABEL: define dso_local spir_kernel void @foo( -// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META10:![0-9]+]] !kernel_arg_type_qual [[META11:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 // CHECK-NEXT: [[EXTRACTVEC1_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> -// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1_I]], ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA8:![0-9]+]] +// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1_I]], ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA12:![0-9]+]] // CHECK-NEXT: ret void // void kernel foo(global float3 *a, global float3 *b) { @@ -21,11 +21,11 @@ void kernel foo(global float3 *a, global float3 *b) { } // CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3( -// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] { +// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META11]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: [[EXTRACTVEC_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> -// CHECK-NEXT: store <4 x float> [[EXTRACTVEC_I]], ptr addrspace(1) [[A]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: store <4 x float> [[EXTRACTVEC_I]], ptr addrspace(1) [[A]], align 16, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // void kernel float4_to_float3(global float3 *a, global float4 *b) { @@ -33,11 +33,11 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) { } // CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4( -// CHECK-SAME: ptr addrspace(1) noundef 
readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] { +// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META11]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 // CHECK-NEXT: [[ASTYPE_I:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> -// CHECK-NEXT: store <4 x float> [[ASTYPE_I]], ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: store <4 x float> [[ASTYPE_I]], ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // void kernel float3_to_float4(global float3 *a, global float4 *b) { @@ -45,11 +45,11 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) { } // CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2( -// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] { +// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META11]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> -// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // void kernel float3_to_double2(global float3 *a, global double2 *b) { @@ -57,11 +57,11 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) { } // CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3( -// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] { +// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META18:![0-9]+]] !kernel_arg_type_qual 
[[META11]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: [[EXTRACTVEC_I:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> -// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC_I]], ptr addrspace(1) [[A]], align 8, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC_I]], ptr addrspace(1) [[A]], align 8, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // void kernel char8_to_short3(global short3 *a, global char8 *b) { @@ -72,7 +72,7 @@ void kernel char8_to_short3(global short3 *a, global char8 *b) { // CHECK-SAME: <3 x i8> noundef [[A:%.*]], ptr addrspace(1) noundef writeonly captures(none) initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A]], <3 x i8> poison, <4 x i32> -// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[INT_TBAA17:![0-9]+]] +// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[INT_TBAA3:![0-9]+]] // CHECK-NEXT: ret void // void from_char3(char3 a, global int *out) { @@ -95,7 +95,7 @@ void from_short3(short3 a, global long *out) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[A]] to <4 x i8> // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> -// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // void scalar_to_char3(int a, global char3 *out) { @@ -107,7 +107,7 @@ void scalar_to_char3(int a, global char3 *out) { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16> // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> -// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA8]] +// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[CHAR_TBAA12]] // CHECK-NEXT: ret void // void scalar_to_short3(long a, global short3 *out) { @@ -115,22 +115,22 @@ void scalar_to_short3(long a, global short3 *out) { } //. 
-// CHECK: [[META3]] = !{i32 1, i32 1} -// CHECK: [[META4]] = !{!"none", !"none"} -// CHECK: [[META5]] = !{!"float3*", !"float3*"} -// CHECK: [[META6]] = !{!"float __attribute__((ext_vector_type(3)))*", !"float __attribute__((ext_vector_type(3)))*"} -// CHECK: [[META7]] = !{!"", !""} -// CHECK: [[CHAR_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} -// CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0} -// CHECK: [[META10]] = !{!"Simple C/C++ TBAA"} -// CHECK: [[META11]] = !{!"float3*", !"float4*"} -// CHECK: [[META12]] = !{!"float __attribute__((ext_vector_type(3)))*", !"float __attribute__((ext_vector_type(4)))*"} -// CHECK: [[META13]] = !{!"float3*", !"double2*"} -// CHECK: [[META14]] = !{!"float __attribute__((ext_vector_type(3)))*", !"double __attribute__((ext_vector_type(2)))*"} -// CHECK: [[META15]] = !{!"short3*", !"char8*"} -// CHECK: [[META16]] = !{!"short __attribute__((ext_vector_type(3)))*", !"char __attribute__((ext_vector_type(8)))*"} -// CHECK: [[INT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} -// CHECK: [[META18]] = !{!"int", [[META9]], i64 0} +// CHECK: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +// CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +// CHECK: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[META7]] = !{i32 1, i32 1} +// CHECK: [[META8]] = !{!"none", !"none"} +// CHECK: [[META9]] = !{!"float3*", !"float3*"} +// CHECK: [[META10]] = !{!"float __attribute__((ext_vector_type(3)))*", !"float __attribute__((ext_vector_type(3)))*"} +// CHECK: [[META11]] = !{!"", !""} +// CHECK: [[CHAR_TBAA12]] = !{[[META5]], [[META5]], i64 0} +// CHECK: [[META13]] = !{!"float3*", !"float4*"} +// CHECK: [[META14]] = !{!"float __attribute__((ext_vector_type(3)))*", !"float __attribute__((ext_vector_type(4)))*"} +// CHECK: [[META15]] = !{!"float3*", !"double2*"} +// CHECK: [[META16]] = !{!"float __attribute__((ext_vector_type(3)))*", !"double __attribute__((ext_vector_type(2)))*"} +// CHECK: [[META17]] = !{!"short3*", !"char8*"} +// CHECK: [[META18]] = !{!"short __attribute__((ext_vector_type(3)))*", !"char __attribute__((ext_vector_type(8)))*"} // CHECK: [[LONG_TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} -// CHECK: [[META20]] = !{!"long", [[META9]], i64 0} +// CHECK: [[META20]] = !{!"long", [[META5]], i64 0} //. 
diff --git a/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp b/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp index e932e75d025e0..07f5929756fcf 100644 --- a/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp +++ b/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp @@ -2,11 +2,11 @@ //RUN: %clang_cc1 %s -triple spir -emit-llvm -O1 -o - | FileCheck %s // CHECK-LABEL: define dso_local spir_kernel void @test( -// CHECK-SAME: ptr addrspace(1) noundef readonly align 8 captures(none) [[IN:%.*]], ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// CHECK-SAME: ptr addrspace(1) noundef readonly align 8 captures(none) [[IN:%.*]], ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META9:![0-9]+]] !kernel_arg_type [[META10:![0-9]+]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[IN]], i32 8 -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(1) [[ARRAYIDX1_I]], align 8, !tbaa [[LONG_TBAA8:![0-9]+]] -// CHECK-NEXT: store i64 [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[LONG_TBAA8]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(1) [[ARRAYIDX1_I]], align 8, !tbaa [[LONG_TBAA12:![0-9]+]] +// CHECK-NEXT: store i64 [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[LONG_TBAA12]] // CHECK-NEXT: ret void // __kernel void test(__global long *In, __global long *Out) { @@ -14,12 +14,12 @@ __kernel void test(__global long *In, __global long *Out) { *Out = m[1]; } //. -// CHECK: [[META4]] = !{i32 1, i32 1} -// CHECK: [[META5]] = !{!"none", !"none"} -// CHECK: [[META6]] = !{!"long*", !"long*"} -// CHECK: [[META7]] = !{!"", !""} -// CHECK: [[LONG_TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} -// CHECK: [[META9]] = !{!"long", [[META10:![0-9]+]], i64 0} -// CHECK: [[META10]] = !{!"omnipotent char", [[META11:![0-9]+]], i64 0} -// CHECK: [[META11]] = !{!"Simple C++ TBAA"} +// CHECK: [[META6:![0-9]+]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// CHECK: [[META7]] = !{!"Simple C++ TBAA"} +// CHECK: [[META8]] = !{i32 1, i32 1} +// CHECK: [[META9]] = !{!"none", !"none"} +// CHECK: [[META10]] = !{!"long*", !"long*"} +// CHECK: [[META11]] = !{!"", !""} +// CHECK: [[LONG_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +// CHECK: [[META13]] = !{!"long", [[META6]], i64 0} //. 
diff --git a/clang/test/DebugInfo/CXX/versioned-language.cpp b/clang/test/DebugInfo/CXX/versioned-language.cpp new file mode 100644 index 0000000000000..4cb2b29086035 --- /dev/null +++ b/clang/test/DebugInfo/CXX/versioned-language.cpp @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=5 -std=c++98 \ +// RUN: | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion" +// +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++98 | FileCheck %s --check-prefix=CHECK-CPP98 +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++03 | FileCheck %s --check-prefix=CHECK-CPP03 +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++11 | FileCheck %s --check-prefix=CHECK-CPP11 +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++14 | FileCheck %s --check-prefix=CHECK-CPP14 +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++17 | FileCheck %s --check-prefix=CHECK-CPP17 +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++20 | FileCheck %s --check-prefix=CHECK-CPP20 +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++23 | FileCheck %s --check-prefix=CHECK-CPP23 +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c++2c | FileCheck %s --check-prefix=CHECK-CPP2C + +struct Foo {} globalVar; + +// CHECK-CPP98: !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 199711 +// FIXME: C++03 technically has no official standard version code. From Clang's point of view, C++03 and C++98 are interchangeable.
+// CHECK-CPP03: !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 199711 +// CHECK-CPP11: !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 201103 +// CHECK-CPP14: !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 201402 +// CHECK-CPP17: !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 201703 +// CHECK-CPP20: !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 202002 +// CHECK-CPP23: !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 202302 +// CHECK-CPP2C: !DICompileUnit(sourceLanguageName: DW_LNAME_C_plus_plus, sourceLanguageVersion: 202400 diff --git a/clang/test/DebugInfo/Generic/unsigned-promotion-debuginfo.c b/clang/test/DebugInfo/Generic/unsigned-promotion-debuginfo.c index 6ca17e1f9f285..d50e0befcee79 100644 --- a/clang/test/DebugInfo/Generic/unsigned-promotion-debuginfo.c +++ b/clang/test/DebugInfo/Generic/unsigned-promotion-debuginfo.c @@ -12,34 +12,34 @@ unsigned short si, sj, sk; // CHECKS-LABEL: define dso_local void @testshortmul( -// CHECKS-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG13:![0-9]+]] { +// CHECKS-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG17:![0-9]+]] { // CHECKS-NEXT: [[ENTRY:.*:]] -// CHECKS-NEXT: [[TMP0:%.*]] = load i16, ptr @sj, align 2, !dbg [[DBG16:![0-9]+]], !tbaa [[SHORT_TBAA17:![0-9]+]] -// CHECKS-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32, !dbg [[DBG16]] -// CHECKS-NEXT: [[TMP1:%.*]] = load i16, ptr @sk, align 2, !dbg [[DBG21:![0-9]+]], !tbaa [[SHORT_TBAA17]] -// CHECKS-NEXT: [[CONV1:%.*]] = zext i16 [[TMP1]] to i32, !dbg [[DBG21]] -// CHECKS-NEXT: [[TMP2:%.*]] = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[CONV]], i32 [[CONV1]]), !dbg [[DBG22:![0-9]+]], !nosanitize [[META26:![0-9]+]] -// CHECKS-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !dbg [[DBG22]], !nosanitize [[META26]] -// CHECKS-NEXT: br i1 [[TMP3]], label %[[HANDLER_MUL_OVERFLOW:.*]], label %[[CONT:.*]], !dbg [[DBG22]], !prof [[PROF27:![0-9]+]], !nosanitize [[META26]] +// CHECKS-NEXT: [[TMP0:%.*]] = load i16, ptr @sj, align 2, !dbg [[DBG20:![0-9]+]], !tbaa [[SHORT_TBAA21:![0-9]+]] +// CHECKS-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32, !dbg [[DBG20]] +// CHECKS-NEXT: [[TMP1:%.*]] = load i16, ptr @sk, align 2, !dbg [[DBG23:![0-9]+]], !tbaa [[SHORT_TBAA21]] +// CHECKS-NEXT: [[CONV1:%.*]] = zext i16 [[TMP1]] to i32, !dbg [[DBG23]] +// CHECKS-NEXT: [[TMP2:%.*]] = tail call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[CONV]], i32 [[CONV1]]), !dbg [[DBG24:![0-9]+]], !nosanitize [[META28:![0-9]+]] +// CHECKS-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !dbg [[DBG24]], !nosanitize [[META28]] +// CHECKS-NEXT: br i1 [[TMP3]], label %[[HANDLER_MUL_OVERFLOW:.*]], label %[[CONT:.*]], !dbg [[DBG24]], !prof [[PROF29:![0-9]+]], !nosanitize [[META28]] // CHECKS: [[HANDLER_MUL_OVERFLOW]]: -// CHECKS-NEXT: [[TMP4:%.*]] = zext i16 [[TMP0]] to i64, !dbg [[DBG22]] -// CHECKS-NEXT: [[TMP5:%.*]] = zext i16 [[TMP1]] to i64, !dbg [[DBG22]] -// CHECKS-NEXT: tail call void @__ubsan_handle_mul_overflow_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP4]], i64 [[TMP5]]) #[[ATTR3:[0-9]+]], !dbg [[DBG22]], !nosanitize [[META26]] -// CHECKS-NEXT: unreachable, !dbg [[DBG22]], !nosanitize [[META26]] +// CHECKS-NEXT: [[TMP4:%.*]] = zext i16 [[TMP0]] to i64, !dbg [[DBG24]] +// CHECKS-NEXT: [[TMP5:%.*]] = zext i16 [[TMP1]] to i64, !dbg [[DBG24]] +// CHECKS-NEXT: tail call void 
@__ubsan_handle_mul_overflow_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[TMP4]], i64 [[TMP5]]) #[[ATTR3:[0-9]+]], !dbg [[DBG24]], !nosanitize [[META28]] +// CHECKS-NEXT: unreachable, !dbg [[DBG24]], !nosanitize [[META28]] // CHECKS: [[CONT]]: -// CHECKS-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !dbg [[DBG22]], !nosanitize [[META26]] -// CHECKS-NEXT: [[CONV2:%.*]] = trunc i32 [[TMP6]] to i16, !dbg [[DBG16]] -// CHECKS-NEXT: store i16 [[CONV2]], ptr @si, align 2, !dbg [[DBG28:![0-9]+]], !tbaa [[SHORT_TBAA17]] -// CHECKS-NEXT: ret void, !dbg [[DBG29:![0-9]+]] +// CHECKS-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !dbg [[DBG24]], !nosanitize [[META28]] +// CHECKS-NEXT: [[CONV2:%.*]] = trunc i32 [[TMP6]] to i16, !dbg [[DBG20]] +// CHECKS-NEXT: store i16 [[CONV2]], ptr @si, align 2, !dbg [[DBG30:![0-9]+]], !tbaa [[SHORT_TBAA21]] +// CHECKS-NEXT: ret void, !dbg [[DBG31:![0-9]+]] // // CHECKU-LABEL: define dso_local void @testshortmul( -// CHECKU-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG13:![0-9]+]] { +// CHECKU-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !dbg [[DBG17:![0-9]+]] { // CHECKU-NEXT: [[ENTRY:.*:]] -// CHECKU-NEXT: [[TMP0:%.*]] = load i16, ptr @sj, align 2, !dbg [[DBG16:![0-9]+]], !tbaa [[SHORT_TBAA17:![0-9]+]] -// CHECKU-NEXT: [[TMP1:%.*]] = load i16, ptr @sk, align 2, !dbg [[DBG21:![0-9]+]], !tbaa [[SHORT_TBAA17]] -// CHECKU-NEXT: [[MUL:%.*]] = mul i16 [[TMP1]], [[TMP0]], !dbg [[DBG22:![0-9]+]] -// CHECKU-NEXT: store i16 [[MUL]], ptr @si, align 2, !dbg [[DBG23:![0-9]+]], !tbaa [[SHORT_TBAA17]] -// CHECKU-NEXT: ret void, !dbg [[DBG24:![0-9]+]] +// CHECKU-NEXT: [[TMP0:%.*]] = load i16, ptr @sj, align 2, !dbg [[DBG20:![0-9]+]], !tbaa [[SHORT_TBAA21:![0-9]+]] +// CHECKU-NEXT: [[TMP1:%.*]] = load i16, ptr @sk, align 2, !dbg [[DBG23:![0-9]+]], !tbaa [[SHORT_TBAA21]] +// CHECKU-NEXT: [[MUL:%.*]] = mul i16 [[TMP1]], [[TMP0]], !dbg [[DBG24:![0-9]+]] +// CHECKU-NEXT: store i16 [[MUL]], ptr @si, align 2, !dbg [[DBG25:![0-9]+]], !tbaa [[SHORT_TBAA21]] +// CHECKU-NEXT: ret void, !dbg [[DBG26:![0-9]+]] // void testshortmul(void) { @@ -58,23 +58,23 @@ void testshortmul(void) { // CHECKS: [[META8]] = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned) // CHECKS: [[META9]] = !DIGlobalVariableExpression(var: [[META10:![0-9]+]], expr: !DIExpression()) // CHECKS: [[META10]] = distinct !DIGlobalVariable(name: "sk", scope: [[META2]], file: [[META7]], line: 12, type: [[META8]], isLocal: false, isDefinition: true) -// CHECKS: [[DBG13]] = distinct !DISubprogram(name: "testshortmul", scope: [[META7]], file: [[META7]], line: 44, type: [[META14:![0-9]+]], scopeLine: 44, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META2]]) -// CHECKS: [[META14]] = !DISubroutineType(types: [[META15:![0-9]+]]) -// CHECKS: [[META15]] = !{null} -// CHECKS: [[DBG16]] = !DILocation(line: 47, column: 8, scope: [[DBG13]]) -// CHECKS: [[SHORT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} -// CHECKS: [[META18]] = !{!"short", [[META19:![0-9]+]], i64 0} -// CHECKS: [[META19]] = !{!"omnipotent char", [[META20:![0-9]+]], i64 0} -// CHECKS: [[META20]] = !{!"Simple C/C++ TBAA"} -// CHECKS: [[DBG21]] = !DILocation(line: 47, column: 13, scope: [[DBG13]]) -// CHECKS: [[DBG22]] = !DILocation(line: 0, scope: [[META23:![0-9]+]], inlinedAt: [[META25:![0-9]+]]) -// CHECKS: [[META23]] = distinct !DISubprogram(name: "__ubsan_check_mul_overflow", scope: [[META7]], file: [[META7]], type: [[META24:![0-9]+]], flags: DIFlagArtificial, spFlags: 
DISPFlagDefinition, unit: [[META2]]) -// CHECKS: [[META24]] = !DISubroutineType(types: null) -// CHECKS: [[META25]] = !DILocation(line: 47, column: 11, scope: [[DBG13]]) -// CHECKS: [[META26]] = !{} -// CHECKS: [[PROF27]] = !{!"branch_weights", i32 1, i32 1048575} -// CHECKS: [[DBG28]] = !DILocation(line: 47, column: 6, scope: [[DBG13]]) -// CHECKS: [[DBG29]] = !DILocation(line: 48, column: 1, scope: [[DBG13]]) +// CHECKS: [[META15:![0-9]+]] = !{!"omnipotent char", [[META16:![0-9]+]], i64 0} +// CHECKS: [[META16]] = !{!"Simple C/C++ TBAA"} +// CHECKS: [[DBG17]] = distinct !DISubprogram(name: "testshortmul", scope: [[META7]], file: [[META7]], line: 44, type: [[META18:![0-9]+]], scopeLine: 44, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META2]]) +// CHECKS: [[META18]] = !DISubroutineType(types: [[META19:![0-9]+]]) +// CHECKS: [[META19]] = !{null} +// CHECKS: [[DBG20]] = !DILocation(line: 47, column: 8, scope: [[DBG17]]) +// CHECKS: [[SHORT_TBAA21]] = !{[[META22:![0-9]+]], [[META22]], i64 0} +// CHECKS: [[META22]] = !{!"short", [[META15]], i64 0} +// CHECKS: [[DBG23]] = !DILocation(line: 47, column: 13, scope: [[DBG17]]) +// CHECKS: [[DBG24]] = !DILocation(line: 0, scope: [[META25:![0-9]+]], inlinedAt: [[META27:![0-9]+]]) +// CHECKS: [[META25]] = distinct !DISubprogram(name: "__ubsan_check_mul_overflow", scope: [[META7]], file: [[META7]], type: [[META26:![0-9]+]], flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: [[META2]]) +// CHECKS: [[META26]] = !DISubroutineType(types: null) +// CHECKS: [[META27]] = !DILocation(line: 47, column: 11, scope: [[DBG17]]) +// CHECKS: [[META28]] = !{} +// CHECKS: [[PROF29]] = !{!"branch_weights", i32 1, i32 1048575} +// CHECKS: [[DBG30]] = !DILocation(line: 47, column: 6, scope: [[DBG17]]) +// CHECKS: [[DBG31]] = !DILocation(line: 48, column: 1, scope: [[DBG17]]) //. 
// CHECKU: [[META0:![0-9]+]] = !DIGlobalVariableExpression(var: [[META1:![0-9]+]], expr: !DIExpression()) // CHECKU: [[META1]] = distinct !DIGlobalVariable(name: "sj", scope: [[META2:![0-9]+]], file: [[META7:![0-9]+]], line: 12, type: [[META8:![0-9]+]], isLocal: false, isDefinition: true) @@ -87,16 +87,16 @@ void testshortmul(void) { // CHECKU: [[META8]] = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned) // CHECKU: [[META9]] = !DIGlobalVariableExpression(var: [[META10:![0-9]+]], expr: !DIExpression()) // CHECKU: [[META10]] = distinct !DIGlobalVariable(name: "sk", scope: [[META2]], file: [[META7]], line: 12, type: [[META8]], isLocal: false, isDefinition: true) -// CHECKU: [[DBG13]] = distinct !DISubprogram(name: "testshortmul", scope: [[META7]], file: [[META7]], line: 44, type: [[META14:![0-9]+]], scopeLine: 44, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META2]]) -// CHECKU: [[META14]] = !DISubroutineType(types: [[META15:![0-9]+]]) -// CHECKU: [[META15]] = !{null} -// CHECKU: [[DBG16]] = !DILocation(line: 47, column: 8, scope: [[DBG13]]) -// CHECKU: [[SHORT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} -// CHECKU: [[META18]] = !{!"short", [[META19:![0-9]+]], i64 0} -// CHECKU: [[META19]] = !{!"omnipotent char", [[META20:![0-9]+]], i64 0} -// CHECKU: [[META20]] = !{!"Simple C/C++ TBAA"} -// CHECKU: [[DBG21]] = !DILocation(line: 47, column: 13, scope: [[DBG13]]) -// CHECKU: [[DBG22]] = !DILocation(line: 47, column: 11, scope: [[DBG13]]) -// CHECKU: [[DBG23]] = !DILocation(line: 47, column: 6, scope: [[DBG13]]) -// CHECKU: [[DBG24]] = !DILocation(line: 48, column: 1, scope: [[DBG13]]) +// CHECKU: [[META15:![0-9]+]] = !{!"omnipotent char", [[META16:![0-9]+]], i64 0} +// CHECKU: [[META16]] = !{!"Simple C/C++ TBAA"} +// CHECKU: [[DBG17]] = distinct !DISubprogram(name: "testshortmul", scope: [[META7]], file: [[META7]], line: 44, type: [[META18:![0-9]+]], scopeLine: 44, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META2]]) +// CHECKU: [[META18]] = !DISubroutineType(types: [[META19:![0-9]+]]) +// CHECKU: [[META19]] = !{null} +// CHECKU: [[DBG20]] = !DILocation(line: 47, column: 8, scope: [[DBG17]]) +// CHECKU: [[SHORT_TBAA21]] = !{[[META22:![0-9]+]], [[META22]], i64 0} +// CHECKU: [[META22]] = !{!"short", [[META15]], i64 0} +// CHECKU: [[DBG23]] = !DILocation(line: 47, column: 13, scope: [[DBG17]]) +// CHECKU: [[DBG24]] = !DILocation(line: 47, column: 11, scope: [[DBG17]]) +// CHECKU: [[DBG25]] = !DILocation(line: 47, column: 6, scope: [[DBG17]]) +// CHECKU: [[DBG26]] = !DILocation(line: 48, column: 1, scope: [[DBG17]]) //. 
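The versioned-language tests in this patch (C++ above; C, ObjC, and ObjC++ below) all exercise the same behavior: at -dwarf-version=6 the emitted !DICompileUnit carries a sourceLanguageName (a DW_LNAME_* code) and, where the standard defines one, a sourceLanguageVersion equal to the language's version macro value (e.g. 201710 for C17). As a minimal sketch of reproducing one of these RUN lines by hand outside of lit — the file name test.c and the grep filter are illustrative assumptions, not part of the patch, and the surrounding metadata fields are elided:

    $ echo 'int globalVar = 10;' > test.c
    $ clang -cc1 -emit-llvm test.c -o - -debug-info-kind=limited -dwarf-version=6 -std=c17 | grep DICompileUnit
    # Expected to contain the substring matched by CHECK-C17 below:
    #   !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 201710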
diff --git a/clang/test/DebugInfo/Generic/versioned-language.c b/clang/test/DebugInfo/Generic/versioned-language.c new file mode 100644 index 0000000000000..1faa7b4b56d4e --- /dev/null +++ b/clang/test/DebugInfo/Generic/versioned-language.c @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=5 -std=c99 \ +// RUN: | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion" +// +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c89 | FileCheck %s --check-prefix=CHECK-C89 --implicit-check-not "sourceLanguageVersion" +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c99 | FileCheck %s --check-prefix=CHECK-C99 +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c11 | FileCheck %s --check-prefix=CHECK-C11 +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c17 | FileCheck %s --check-prefix=CHECK-C17 +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c23 | FileCheck %s --check-prefix=CHECK-C23 +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 -std=c2y | FileCheck %s --check-prefix=CHECK-C2Y + +int globalVar = 10; + +// CHECK-C89: !DICompileUnit(sourceLanguageName: DW_LNAME_C, +// CHECK-C99: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 199901 +// CHECK-C11: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 201112 +// CHECK-C17: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 201710 +// CHECK-C23: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 202311 +// CHECK-C2Y: !DICompileUnit(sourceLanguageName: DW_LNAME_C, sourceLanguageVersion: 202400 diff --git a/clang/test/DebugInfo/ObjC/versioned-language.m b/clang/test/DebugInfo/ObjC/versioned-language.m new file mode 100644 index 0000000000000..178c47bf8c841 --- /dev/null +++ b/clang/test/DebugInfo/ObjC/versioned-language.m @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=5 \ +// RUN: | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion" +// +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 \ +// RUN: | FileCheck %s --implicit-check-not "sourceLanguageVersion" --check-prefix=CHECK-OBJC + +int globalVar = 10; + +// CHECK-OBJC: !DICompileUnit(sourceLanguageName: DW_LNAME_ObjC, diff --git a/clang/test/DebugInfo/ObjCXX/versioned-language.mm b/clang/test/DebugInfo/ObjCXX/versioned-language.mm new file mode 100644 index 0000000000000..bfdce462b2bf1 --- /dev/null +++ b/clang/test/DebugInfo/ObjCXX/versioned-language.mm @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=5 \ +// RUN: | FileCheck %s --implicit-check-not "sourceLanguageName" --implicit-check-not "sourceLanguageVersion" +// +// RUN: %clang_cc1 -emit-llvm %s -o - -debug-info-kind=limited -dwarf-version=6 \ +// RUN: | FileCheck %s --implicit-check-not "sourceLanguageVersion" --check-prefix=CHECK-OBJCXX + +int globalVar = 10; + +// CHECK-OBJCXX: !DICompileUnit(sourceLanguageName: DW_LNAME_ObjC_plus_plus, diff --git a/clang/test/Driver/arm-abi.c b/clang/test/Driver/arm-abi.c index 139456cf98e14..b89b969858acb 100644 --- a/clang/test/Driver/arm-abi.c +++ b/clang/test/Driver/arm-abi.c @@ -31,6 +31,8 @@ // FreeBSD / OpenBSD default to aapcs-linux // RUN: %clang -target arm--freebsd- %s -### -o %t.o 2>&1 \ // RUN: | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s +// RUN: %clang -target
arm--fuchsia- %s -### -o %t.o 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s // RUN: %clang -target arm--openbsd- %s -### -o %t.o 2>&1 \ // RUN: | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s // RUN: %clang -target arm--haiku- %s -### -o %t.o 2>&1 \ diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c index cf92f85040901..3fb2a94124cd1 100644 --- a/clang/test/Driver/fuchsia.c +++ b/clang/test/Driver/fuchsia.c @@ -2,6 +2,10 @@ // RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ // RUN: --sysroot=%S/platform -fuse-ld=ld 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,CHECK-X86_64 %s +// RUN: %clang -### %s --target=arm-unknown-fuchsia \ +// RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ +// RUN: --sysroot=%S/platform -fuse-ld=ld 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,CHECK-ARMV8A %s // RUN: %clang -### %s --target=aarch64-unknown-fuchsia \ // RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ // RUN: --sysroot=%S/platform -fuse-ld=ld 2>&1 \ @@ -14,6 +18,10 @@ // RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ // RUN: --sysroot=%S/platform -fuse-ld=ld 2>&1 \ // RUN: | FileCheck -check-prefixes=CHECK,CHECK-X86_64 %s +// RUN: %clang -### %s --target=arm-fuchsia \ +// RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ +// RUN: --sysroot=%S/platform -fuse-ld=ld 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK,CHECK-ARMV8A %s // RUN: %clang -### %s --target=aarch64-fuchsia \ // RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ // RUN: --sysroot=%S/platform -fuse-ld=ld 2>&1 \ @@ -24,6 +32,7 @@ // RUN: | FileCheck -check-prefixes=CHECK,CHECK-RISCV64 %s // CHECK: "-cc1" // CHECK-X86_64: "-triple" "x86_64-unknown-fuchsia" +// CHECK-ARMV8A: "-triple" "thumbv8a-unknown-fuchsia" // CHECK-AARCH64: "-triple" "aarch64-unknown-fuchsia" // CHECK-RISCV64: "-triple" "riscv64-unknown-fuchsia" // CHECK: "-funwind-tables=2" diff --git a/clang/test/Driver/gpu-libc-headers.c b/clang/test/Driver/gpu-libc-headers.c deleted file mode 100644 index 18029193edeba..0000000000000 --- a/clang/test/Driver/gpu-libc-headers.c +++ /dev/null @@ -1,14 +0,0 @@ -// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a --sysroot=%S/Inputs/basic_gpu_tree \ -// RUN: -ccc-install-dir %S/Inputs/basic_gpu_tree/bin -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-AMDGPU -// RUN: %clang -### --target=nvptx64-nvidia-cuda -march=sm_89 --sysroot=%S/Inputs/basic_gpu_tree \ -// RUN: -ccc-install-dir %S/Inputs/basic_gpu_tree/bin -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-NVPTX -// CHECK-HEADERS-AMDGPU: "-cc1"{{.*}}"-isysroot"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}amdgcn-amd-amdhsa" -// CHECK-HEADERS-NVPTX: "-cc1"{{.*}}"-isysroot"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}nvptx64-nvidia-cuda" - -// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -nogpulib \ -// RUN: -nogpuinc %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-DISABLED -// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -nogpulib \ -// RUN: -nostdinc %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-DISABLED -// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -nogpulib \ -// RUN: -nobuiltininc %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-DISABLED -// CHECK-HEADERS-DISABLED-NOT: "-cc1"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}gpu-none-llvm" diff --git a/clang/test/Driver/gpu-libc.c b/clang/test/Driver/gpu-libc.c new file mode 100644 index 
0000000000000..88f346f32e0b8 --- /dev/null +++ b/clang/test/Driver/gpu-libc.c @@ -0,0 +1,32 @@ +// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a --sysroot=%S/Inputs/basic_gpu_tree \ +// RUN: -ccc-install-dir %S/Inputs/basic_gpu_tree/bin -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-AMDGPU +// RUN: %clang -### --target=nvptx64-nvidia-cuda -march=sm_89 --sysroot=%S/Inputs/basic_gpu_tree \ +// RUN: -ccc-install-dir %S/Inputs/basic_gpu_tree/bin -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-NVPTX +// CHECK-HEADERS-AMDGPU: "-cc1"{{.*}}"-isysroot"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}amdgcn-amd-amdhsa" +// CHECK-HEADERS-NVPTX: "-cc1"{{.*}}"-isysroot"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}nvptx64-nvidia-cuda" + +// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -nogpulib \ +// RUN: -nogpuinc %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-DISABLED +// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -nogpulib \ +// RUN: -nostdinc %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-DISABLED +// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -nogpulib \ +// RUN: -nobuiltininc %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-DISABLED +// CHECK-HEADERS-DISABLED-NOT: "-cc1"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}gpu-none-llvm" + + +// RUN: %clang -### -fopenmp=libomp --target=x86_64-unknown-linux-gnu \ +// RUN: --offload-arch=gfx908 --rocm-path=%S/Inputs/rocm --sysroot=%S/Inputs/basic_gpu_tree \ +// RUN: -ccc-install-dir %S/Inputs/basic_gpu_tree/bin %s 2>&1 | FileCheck %s --check-prefix=OPENMP-AMDGPU +// OPENMP-AMDGPU: clang-linker-wrapper{{.*}}"--device-linker=amdgcn-amd-amdhsa=-lc" +// RUN: %clang -### -fopenmp=libomp --target=x86_64-unknown-linux-gnu -foffload-lto \ +// RUN: --offload-arch=sm_52 --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda --sysroot=%S/Inputs/basic_gpu_tree \ +// RUN: -ccc-install-dir %S/Inputs/basic_gpu_tree/bin %s 2>&1 | FileCheck %s --check-prefix=OPENMP-NVPTX +// OPENMP-NVPTX: clang-linker-wrapper{{.*}}"--device-linker=nvptx64-nvidia-cuda=-lc" +// RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx908 \ +// RUN: --offload-new-driver --rocm-path=%S/Inputs/rocm --sysroot=%S/Inputs/basic_gpu_tree \ +// RUN: -ccc-install-dir %S/Inputs/basic_gpu_tree/bin -x hip %s 2>&1 | FileCheck %s --check-prefix=HIP +// HIP-NOT: "--device-linker=amdgcn-amd-amdhsa=-lc" +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fgpu-rdc --offload-arch=sm_52 \ +// RUN: --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda --sysroot=%S/Inputs/basic_gpu_tree \ +// RUN: -ccc-install-dir %S/Inputs/basic_gpu_tree/bin -x cuda %s 2>&1 | FileCheck %s --check-prefix=CUDA +// CUDA-NOT: "--device-linker=nvptx64-nvidia-cuda=-lc" diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index 52a961d7b2388..39b9bcd7425ab 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -102,7 +102,7 @@ __attribute__((visibility("protected"), used)) int x; // CUDA: clang{{.*}} -o [[IMG_SM70:.+]] -dumpdir a.out.nvptx64.sm_70.img. --target=nvptx64-nvidia-cuda -march=sm_70 // CUDA: clang{{.*}} -o [[IMG_SM52:.+]] -dumpdir a.out.nvptx64.sm_52.img. 
--target=nvptx64-nvidia-cuda -march=sm_52 -// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_70,file=[[IMG_SM70]] --image=profile=sm_52,file=[[IMG_SM52]] +// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image3=kind=elf,sm=70,file=[[IMG_SM70]] --image3=kind=elf,sm=52,file=[[IMG_SM52]] // CUDA: usr/bin/ld{{.*}} {{.*}}.openmp.image.{{.*}}.o {{.*}}.cuda.image.{{.*}}.o // RUN: llvm-offload-binary -o %t.out \ @@ -236,7 +236,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=RELOCATABLE-LINK-CUDA // RELOCATABLE-LINK-CUDA: clang{{.*}} -o {{.*}}.img -dumpdir a.out.nvptx64.sm_89.img. --target=nvptx64-nvidia-cuda -// RELOCATABLE-LINK-CUDA: fatbinary{{.*}} -64 --create {{.*}}.fatbin --image=profile=sm_89,file={{.*}}.img +// RELOCATABLE-LINK-CUDA: fatbinary{{.*}} -64 --create {{.*}}.fatbin --image3=kind=elf,sm=89,file={{.*}}.img // RELOCATABLE-LINK-CUDA: /usr/bin/ld.lld{{.*}}-r // RELOCATABLE-LINK-CUDA: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading diff --git a/clang/test/Driver/x86-march.c b/clang/test/Driver/x86-march.c index 341f01c8d668d..15f82547892c2 100644 --- a/clang/test/Driver/x86-march.c +++ b/clang/test/Driver/x86-march.c @@ -116,6 +116,14 @@ // RUN: | FileCheck %s -check-prefix=pantherlake // pantherlake: "-target-cpu" "pantherlake" // +// RUN: %clang --target=x86_64 -c -### %s -march=wildcatlake 2>&1 \ +// RUN: | FileCheck %s -check-prefix=wildcatlake +// wildcatlake: "-target-cpu" "wildcatlake" +// +// RUN: %clang --target=x86_64 -c -### %s -march=novalake 2>&1 \ +// RUN: | FileCheck %s -check-prefix=novalake +// novalake: "-target-cpu" "novalake" +// // RUN: %clang --target=x86_64 -c -### %s -march=clearwaterforest 2>&1 \ // RUN: | FileCheck %s -check-prefix=clearwaterforest // clearwaterforest: "-target-cpu" "clearwaterforest" diff --git a/clang/test/FixIt/fixit-constrained-structured-binding.cpp b/clang/test/FixIt/fixit-constrained-structured-binding.cpp index 3f21c1da45a4a..bfb004fe035e1 100644 --- a/clang/test/FixIt/fixit-constrained-structured-binding.cpp +++ b/clang/test/FixIt/fixit-constrained-structured-binding.cpp @@ -14,20 +14,20 @@ T get_T(); void use() { UnaryC auto [a, b] = get_S(); - // CHECK: error: decomposition declaration cannot be declared with constrained 'auto' + // CHECK: error: structured binding declaration cannot be declared with constrained 'auto' // CHECK: fix-it:{{.*}}:{16:3-16:10}:"" BinaryC auto [c, d] = get_S(); - // CHECK: error: decomposition declaration cannot be declared with constrained 'auto' + // CHECK: error: structured binding declaration cannot be declared with constrained 'auto' // CHECK: fix-it:{{.*}}:{19:3-19:16}:"" } template void TemplUse() { UnaryC auto [a, b] = get_T(); - // CHECK: error: decomposition declaration cannot be declared with constrained 'auto' + // CHECK: error: structured binding declaration cannot be declared with constrained 'auto' // XCHECK: fix-it:{{.*}}:{26:3-26:10}:"" BinaryC auto [c, d] = get_T(); - // CHECK: error: decomposition declaration cannot be declared with constrained 'auto' + // CHECK: error: structured binding declaration cannot be declared with constrained 'auto' // XCHECK: fix-it:{{.*}}:{29:3-29:14}:"" } diff --git a/clang/test/Frontend/rewrite-includes-bom.c b/clang/test/Frontend/rewrite-includes-bom.c index caa431ad9aaff..27bf470ba1fd1 100644 --- a/clang/test/Frontend/rewrite-includes-bom.c +++ b/clang/test/Frontend/rewrite-includes-bom.c @@ -1,8 +1,8 @@ -// RUN: grep -q $'^\xEF\xBB\xBF' 
%S/Inputs/rewrite-includes-bom.h +// RUN: cat %S/Inputs/rewrite-includes-bom.h | od -t x1 | grep -q 'ef\s*bb\s*bf' // RUN: %clang_cc1 -E -frewrite-includes -I %S/Inputs %s -o %t.c -// RUN: ! grep -q $'\xEF\xBB\xBF' %t.c +// RUN: cat %t.c | od -t x1 | not grep -q 'ef\s*bb\s*bf' // RUN: %clang_cc1 -fsyntax-only -verify %t.c // expected-no-diagnostics -// REQUIRES: shell +// UNSUPPORTED: system-windows #include "rewrite-includes-bom.h" diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index ef0d590591c7e..22c0689a4552e 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -50,7 +50,7 @@ typedef unsigned long long uint64_t; // CHECK-LABEL: define dso_local i64 @test___make_mantissa_base8( // CHECK-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*]]: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA4:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA8:![0-9]+]] // CHECK-NEXT: [[CMP_NOT12_I:%.*]] = icmp eq i8 [[TMP0]], 0 // CHECK-NEXT: br i1 [[CMP_NOT12_I]], label %[[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT:.*]], label %[[WHILE_BODY_I:.*]] // CHECK: [[WHILE_BODY_I]]: @@ -66,9 +66,9 @@ typedef unsigned long long uint64_t; // CHECK-NEXT: [[ADD_I:%.*]] = add nsw i64 [[CONV5_I]], -48 // CHECK-NEXT: [[SUB_I]] = or disjoint i64 [[ADD_I]], [[MUL_I]] // CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I]], i64 1 -// CHECK-NEXT: [[TMP3]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA4]] +// CHECK-NEXT: [[TMP3]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP3]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP7:![0-9]+]] +// CHECK-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP9:![0-9]+]] // CHECK: [[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]]: // CHECK-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[SUB_I]], %[[IF_THEN_I]] ], [ 0, %[[WHILE_BODY_I]] ] // CHECK-NEXT: ret i64 [[RETVAL_2_I]] @@ -80,7 +80,7 @@ typedef unsigned long long uint64_t; // AMDGCNSPIRV: [[WHILE_COND_I]]: // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I:%.*]] = phi ptr addrspace(4) [ [[P]], %[[ENTRY]] ], [ [[__TAGP_ADDR_1_I:%.*]], %[[WHILE_BODY_I:.*]] ] // AMDGCNSPIRV-NEXT: [[__R_0_I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[__R_1_I:%.*]], %[[WHILE_BODY_I]] ] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], align 1, !tbaa [[CHAR_TBAA5:![0-9]+]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], align 1, !tbaa [[CHAR_TBAA9:![0-9]+]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP0]], 0 // AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT:.*]], label %[[WHILE_BODY_I]] // AMDGCNSPIRV: [[WHILE_BODY_I]]: @@ -93,7 +93,7 @@ typedef unsigned long long uint64_t; // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I:%.*]] = zext i1 [[OR_COND_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], i64 [[__TAGP_ADDR_1_IDX_I]] // AMDGCNSPIRV-NEXT: [[__R_1_I]] = select i1 [[OR_COND_I]], i64 [[SUB_I]], i64 [[__R_0_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], label %[[WHILE_COND_I]], label 
%[[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]], !llvm.loop [[LOOP8:![0-9]+]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], label %[[WHILE_COND_I]], label %[[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]], !llvm.loop [[LOOP10:![0-9]+]] // AMDGCNSPIRV: [[_ZL21__MAKE_MANTISSA_BASE8PKC_EXIT]]: // AMDGCNSPIRV-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, %[[WHILE_BODY_I]] ], [ [[__R_0_I]], %[[WHILE_COND_I]] ] // AMDGCNSPIRV-NEXT: ret i64 [[RETVAL_2_I]] @@ -105,7 +105,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base8(const char *p) { // CHECK-LABEL: define dso_local i64 @test___make_mantissa_base10( // CHECK-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*]]: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA4]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: [[CMP_NOT12_I:%.*]] = icmp eq i8 [[TMP0]], 0 // CHECK-NEXT: br i1 [[CMP_NOT12_I]], label %[[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT:.*]], label %[[WHILE_BODY_I:.*]] // CHECK: [[WHILE_BODY_I]]: @@ -121,9 +121,9 @@ extern "C" __device__ uint64_t test___make_mantissa_base8(const char *p) { // CHECK-NEXT: [[ADD_I:%.*]] = add nsw i64 [[CONV5_I]], -48 // CHECK-NEXT: [[SUB_I]] = add i64 [[ADD_I]], [[MUL_I]] // CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I]], i64 1 -// CHECK-NEXT: [[TMP3]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA4]] +// CHECK-NEXT: [[TMP3]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP3]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP12:![0-9]+]] // CHECK: [[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]]: // CHECK-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[SUB_I]], %[[IF_THEN_I]] ], [ 0, %[[WHILE_BODY_I]] ] // CHECK-NEXT: ret i64 [[RETVAL_2_I]] @@ -135,7 +135,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base8(const char *p) { // AMDGCNSPIRV: [[WHILE_COND_I]]: // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I:%.*]] = phi ptr addrspace(4) [ [[P]], %[[ENTRY]] ], [ [[__TAGP_ADDR_1_I:%.*]], %[[WHILE_BODY_I:.*]] ] // AMDGCNSPIRV-NEXT: [[__R_0_I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[__R_1_I:%.*]], %[[WHILE_BODY_I]] ] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP0]], 0 // AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT:.*]], label %[[WHILE_BODY_I]] // AMDGCNSPIRV: [[WHILE_BODY_I]]: @@ -148,7 +148,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base8(const char *p) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I:%.*]] = zext i1 [[OR_COND_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I]], i64 [[__TAGP_ADDR_1_IDX_I]] // AMDGCNSPIRV-NEXT: [[__R_1_I]] = select i1 [[OR_COND_I]], i64 [[SUB_I]], i64 [[__R_0_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], label %[[WHILE_COND_I]], label %[[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I]], label %[[WHILE_COND_I]], label 
%[[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]], !llvm.loop [[LOOP13:![0-9]+]] // AMDGCNSPIRV: [[_ZL22__MAKE_MANTISSA_BASE10PKC_EXIT]]: // AMDGCNSPIRV-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, %[[WHILE_BODY_I]] ], [ [[__R_0_I]], %[[WHILE_COND_I]] ] // AMDGCNSPIRV-NEXT: ret i64 [[RETVAL_2_I]] @@ -160,7 +160,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base10(const char *p) { // CHECK-LABEL: define dso_local i64 @test___make_mantissa_base16( // CHECK-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*]]: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA4]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: [[CMP_NOT48_I:%.*]] = icmp eq i8 [[TMP0]], 0 // CHECK-NEXT: br i1 [[CMP_NOT48_I]], label %[[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT:.*]], label %[[WHILE_BODY_I:.*]] // CHECK: [[WHILE_BODY_I]]: @@ -199,9 +199,9 @@ extern "C" __device__ uint64_t test___make_mantissa_base10(const char *p) { // CHECK: [[IF_END31_I]]: // CHECK-NEXT: [[__R_1_I]] = phi i64 [ [[SUB_I]], %[[IF_THEN_I]] ], [ [[ADD16_I]], %[[IF_THEN11_I]] ], [ [[ADD28_I]], %[[IF_THEN23_I]] ] // CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_049_I]], i64 1 -// CHECK-NEXT: [[TMP5]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA4]] +// CHECK-NEXT: [[TMP5]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP5]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP11:![0-9]+]] +// CHECK-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP13:![0-9]+]] // CHECK: [[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]]: // CHECK-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[__R_1_I]], %[[IF_END31_I]] ], [ 0, %[[IF_ELSE17_I]] ] // CHECK-NEXT: ret i64 [[RETVAL_2_I]] @@ -209,7 +209,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base10(const char *p) { // AMDGCNSPIRV-LABEL: define spir_func i64 @test___make_mantissa_base16( // AMDGCNSPIRV-SAME: ptr addrspace(4) noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] { // AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[P]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[P]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_NOT48_I:%.*]] = icmp eq i8 [[TMP0]], 0 // AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT48_I]], label %[[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT:.*]], label %[[WHILE_BODY_I:.*]] // AMDGCNSPIRV: [[WHILE_BODY_I]]: @@ -248,9 +248,9 @@ extern "C" __device__ uint64_t test___make_mantissa_base10(const char *p) { // AMDGCNSPIRV: [[IF_END31_I]]: // AMDGCNSPIRV-NEXT: [[__R_1_I]] = phi i64 [ [[SUB_I]], %[[IF_THEN_I]] ], [ [[ADD16_I]], %[[IF_THEN11_I]] ], [ [[ADD28_I]], %[[IF_THEN23_I]] ] // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_049_I]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP5]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP5]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I:%.*]] = icmp eq i8 [[TMP5]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]], label %[[WHILE_BODY_I]], 
!llvm.loop [[LOOP12:![0-9]+]] +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I]], label %[[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]], label %[[WHILE_BODY_I]], !llvm.loop [[LOOP14:![0-9]+]] // AMDGCNSPIRV: [[_ZL22__MAKE_MANTISSA_BASE16PKC_EXIT]]: // AMDGCNSPIRV-NEXT: [[RETVAL_2_I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[__R_1_I]], %[[IF_END31_I]] ], [ 0, %[[IF_ELSE17_I]] ] // AMDGCNSPIRV-NEXT: ret i64 [[RETVAL_2_I]] @@ -262,14 +262,14 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-LABEL: define dso_local i64 @test___make_mantissa( // CHECK-SAME: ptr noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*]]: -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA4]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: switch i8 [[TMP0]], label %[[WHILE_BODY_I25_I:.*]] [ // CHECK-NEXT: i8 48, label %[[IF_THEN_I:.*]] // CHECK-NEXT: i8 0, label %[[_ZL15__MAKE_MANTISSAPKC_EXIT:.*]] // CHECK-NEXT: ] // CHECK: [[IF_THEN_I]]: // CHECK-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 1 -// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA4]] +// CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: switch i8 [[TMP1]], label %[[WHILE_BODY_I14_I:.*]] [ // CHECK-NEXT: i8 88, label %[[WHILE_BODY_I_I_PREHEADER:.*]] // CHECK-NEXT: i8 120, label %[[WHILE_BODY_I_I_PREHEADER]] @@ -313,9 +313,9 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK: [[IF_END31_I_I]]: // CHECK-NEXT: [[__R_1_I_I]] = phi i64 [ [[SUB_I_I]], %[[IF_THEN_I_I]] ], [ [[ADD16_I_I]], %[[IF_THEN11_I_I]] ], [ [[ADD28_I_I]], %[[IF_THEN23_I_I]] ] // CHECK-NEXT: [[INCDEC_PTR_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_049_I_I]], i64 1 -// CHECK-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// CHECK-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: [[CMP_NOT_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I_I]], !llvm.loop [[LOOP11]] +// CHECK-NEXT: br i1 [[CMP_NOT_I_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I_I]], !llvm.loop [[LOOP13]] // CHECK: [[WHILE_BODY_I14_I]]: // CHECK-NEXT: [[TMP7:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_THEN_I17_I:.*]] ], [ [[TMP1]], %[[IF_THEN_I]] ] // CHECK-NEXT: [[__R_014_I_I:%.*]] = phi i64 [ [[SUB_I21_I:%.*]], %[[IF_THEN_I17_I]] ], [ 0, %[[IF_THEN_I]] ] @@ -329,9 +329,9 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[ADD_I20_I:%.*]] = add nsw i64 [[CONV5_I19_I]], -48 // CHECK-NEXT: [[SUB_I21_I]] = or disjoint i64 [[ADD_I20_I]], [[MUL_I18_I]] // CHECK-NEXT: [[INCDEC_PTR_I22_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I_I]], i64 1 -// CHECK-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I]], align 1, !tbaa [[CHAR_TBAA4]] +// CHECK-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I]], align 1, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: [[CMP_NOT_I23_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I23_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I14_I]], !llvm.loop [[LOOP7]] +// CHECK-NEXT: br i1 [[CMP_NOT_I23_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I14_I]], !llvm.loop [[LOOP9]] // CHECK: [[WHILE_BODY_I25_I]]: // CHECK-NEXT: 
[[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I30_I:.*]] ], [ [[TMP0]], %[[ENTRY]] ] // CHECK-NEXT: [[__R_014_I26_I:%.*]] = phi i64 [ [[SUB_I34_I:%.*]], %[[IF_THEN_I30_I]] ], [ 0, %[[ENTRY]] ] @@ -345,9 +345,9 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // CHECK-NEXT: [[ADD_I33_I:%.*]] = add nsw i64 [[CONV5_I32_I]], -48 // CHECK-NEXT: [[SUB_I34_I]] = add i64 [[ADD_I33_I]], [[MUL_I31_I]] // CHECK-NEXT: [[INCDEC_PTR_I35_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I27_I]], i64 1 -// CHECK-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I]], align 1, !tbaa [[CHAR_TBAA4]] +// CHECK-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I]], align 1, !tbaa [[CHAR_TBAA8]] // CHECK-NEXT: [[CMP_NOT_I36_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// CHECK-NEXT: br i1 [[CMP_NOT_I36_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I25_I]], !llvm.loop [[LOOP10]] +// CHECK-NEXT: br i1 [[CMP_NOT_I36_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I25_I]], !llvm.loop [[LOOP12]] // CHECK: [[_ZL15__MAKE_MANTISSAPKC_EXIT]]: // CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 0, %[[IF_THEN_I]] ], [ 0, %[[ENTRY]] ], [ [[SUB_I34_I]], %[[IF_THEN_I30_I]] ], [ 0, %[[WHILE_BODY_I25_I]] ], [ [[SUB_I21_I]], %[[IF_THEN_I17_I]] ], [ 0, %[[WHILE_BODY_I14_I]] ], [ [[__R_1_I_I]], %[[IF_END31_I_I]] ], [ 0, %[[IF_ELSE17_I_I]] ] // CHECK-NEXT: ret i64 [[RETVAL_0_I]] @@ -355,12 +355,12 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // AMDGCNSPIRV-LABEL: define spir_func i64 @test___make_mantissa( // AMDGCNSPIRV-SAME: ptr addrspace(4) noundef readonly captures(none) [[P:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] { // AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[P]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[P]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_I:%.*]] = icmp eq i8 [[TMP0]], 48 // AMDGCNSPIRV-NEXT: br i1 [[CMP_I]], label %[[IF_THEN_I:.*]], label %[[WHILE_COND_I23_I:.*]] // AMDGCNSPIRV: [[IF_THEN_I]]: // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[P]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: switch i8 [[TMP1]], label %[[WHILE_COND_I_I:.*]] [ // AMDGCNSPIRV-NEXT: i8 88, label %[[WHILE_BODY_I_I_PREHEADER:.*]] // AMDGCNSPIRV-NEXT: i8 120, label %[[WHILE_BODY_I_I_PREHEADER]] @@ -403,13 +403,13 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // AMDGCNSPIRV: [[IF_END31_I_I]]: // AMDGCNSPIRV-NEXT: [[__R_1_I_I]] = phi i64 [ [[SUB_I_I]], %[[IF_THEN_I_I]] ], [ [[ADD16_I_I]], %[[IF_THEN11_I_I]] ], [ [[ADD28_I_I]], %[[IF_THEN23_I_I]] ] // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_049_I_I]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP6]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP6]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I_I]], !llvm.loop [[LOOP12]] +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I]], label 
%[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I_I]], !llvm.loop [[LOOP14]] // AMDGCNSPIRV: [[WHILE_COND_I_I]]: // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I_I:%.*]], %[[WHILE_BODY_I15_I:.*]] ], [ [[INCDEC_PTR_I]], %[[IF_THEN_I]] ] // AMDGCNSPIRV-NEXT: [[__R_0_I_I:%.*]] = phi i64 [ [[__R_1_I21_I:%.*]], %[[WHILE_BODY_I15_I]] ], [ 0, %[[IF_THEN_I]] ] -// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I14_I:%.*]] = icmp eq i8 [[TMP7]], 0 // AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I14_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I15_I]] // AMDGCNSPIRV: [[WHILE_BODY_I15_I]]: @@ -422,11 +422,11 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I_I:%.*]] = zext i1 [[OR_COND_I16_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I]], i64 [[__TAGP_ADDR_1_IDX_I_I]] // AMDGCNSPIRV-NEXT: [[__R_1_I21_I]] = select i1 [[OR_COND_I16_I]], i64 [[SUB_I20_I]], i64 [[__R_0_I_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I16_I]], label %[[WHILE_COND_I_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], !llvm.loop [[LOOP8]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I16_I]], label %[[WHILE_COND_I_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], !llvm.loop [[LOOP10]] // AMDGCNSPIRV: [[WHILE_COND_I23_I]]: // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I24_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I34_I:%.*]], %[[WHILE_BODY_I27_I:.*]] ], [ [[P]], %[[ENTRY]] ] // AMDGCNSPIRV-NEXT: [[__R_0_I25_I:%.*]] = phi i64 [ [[__R_1_I35_I:%.*]], %[[WHILE_BODY_I27_I]] ], [ 0, %[[ENTRY]] ] -// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I24_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I24_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I26_I:%.*]] = icmp eq i8 [[TMP9]], 0 // AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I26_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], label %[[WHILE_BODY_I27_I]] // AMDGCNSPIRV: [[WHILE_BODY_I27_I]]: @@ -439,7 +439,7 @@ extern "C" __device__ uint64_t test___make_mantissa_base16(const char *p) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I33_I:%.*]] = zext i1 [[OR_COND_I28_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I34_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I24_I]], i64 [[__TAGP_ADDR_1_IDX_I33_I]] // AMDGCNSPIRV-NEXT: [[__R_1_I35_I]] = select i1 [[OR_COND_I28_I]], i64 [[SUB_I32_I]], i64 [[__R_0_I25_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I28_I]], label %[[WHILE_COND_I23_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], !llvm.loop [[LOOP11]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I28_I]], label %[[WHILE_COND_I23_I]], label %[[_ZL15__MAKE_MANTISSAPKC_EXIT]], !llvm.loop [[LOOP13]] // AMDGCNSPIRV: [[_ZL15__MAKE_MANTISSAPKC_EXIT]]: // AMDGCNSPIRV-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 0, %[[WHILE_BODY_I15_I]] ], [ [[__R_0_I_I]], %[[WHILE_COND_I_I]] ], [ [[__R_1_I_I]], %[[IF_END31_I_I]] ], [ 0, %[[IF_ELSE17_I_I]] ], [ 0, %[[WHILE_BODY_I27_I]] ], [ [[__R_0_I25_I]], %[[WHILE_COND_I23_I]] ] // AMDGCNSPIRV-NEXT: ret i64 [[RETVAL_0_I]] @@ -2082,7 +2082,7 @@ extern "C" __device__ double test_fdim(double x, double y) { // NCRDIV-LABEL: define dso_local noundef float @test_fdividef( // 
NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { // NCRDIV-NEXT: [[ENTRY:.*:]] -// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X]], [[Y]], !fpmath [[META12:![0-9]+]] +// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X]], [[Y]], !fpmath [[META14:![0-9]+]] // NCRDIV-NEXT: ret float [[DIV_I]] // // AMDGCNSPIRV-LABEL: define spir_func noundef float @test_fdividef( @@ -2474,7 +2474,7 @@ extern "C" __device__ double test_fmod(double x, double y) { // DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) // DEFAULT-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// DEFAULT-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] +// DEFAULT-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] // DEFAULT-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 // DEFAULT-NEXT: ret float [[TMP2]] // @@ -2483,7 +2483,7 @@ extern "C" __device__ double test_fmod(double x, double y) { // FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// FINITEONLY-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] +// FINITEONLY-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] // FINITEONLY-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 // FINITEONLY-NEXT: ret float [[TMP2]] // @@ -2492,7 +2492,7 @@ extern "C" __device__ double test_fmod(double x, double y) { // APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) // APPROX-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// APPROX-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA12:![0-9]+]] +// APPROX-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] // APPROX-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 // APPROX-NEXT: ret float [[TMP2]] // @@ -2501,7 +2501,7 @@ extern "C" __device__ double test_fmod(double x, double y) { // NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X]]) // NCRDIV-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// NCRDIV-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA13:![0-9]+]] +// NCRDIV-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA4:![0-9]+]] // NCRDIV-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 // NCRDIV-NEXT: ret float [[TMP2]] // @@ -2510,7 +2510,7 @@ extern "C" __device__ double test_fmod(double x, double y) { // AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) { float, i32 } @llvm.frexp.f32.i32(float [[X]]) // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[Y]], align 4, !tbaa [[INT_TBAA13:![0-9]+]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[Y]], align 4, !tbaa [[INT_TBAA5:![0-9]+]] // AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 // AMDGCNSPIRV-NEXT: ret float [[TMP2]] // @@ -2523,7 +2523,7 @@ extern "C" __device__ float test_frexpf(float x, int* y) { // DEFAULT-NEXT: [[ENTRY:.*:]] // DEFAULT-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X]]) // DEFAULT-NEXT: [[TMP1:%.*]] = 
extractvalue { double, i32 } [[TMP0]], 1 -// DEFAULT-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA12]] +// DEFAULT-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA4]] // DEFAULT-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 // DEFAULT-NEXT: ret double [[TMP2]] // @@ -2532,7 +2532,7 @@ extern "C" __device__ float test_frexpf(float x, int* y) { // FINITEONLY-NEXT: [[ENTRY:.*:]] // FINITEONLY-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double nofpclass(nan inf) [[X]]) // FINITEONLY-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// FINITEONLY-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA12]] +// FINITEONLY-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA4]] // FINITEONLY-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 // FINITEONLY-NEXT: ret double [[TMP2]] // @@ -2541,7 +2541,7 @@ extern "C" __device__ float test_frexpf(float x, int* y) { // APPROX-NEXT: [[ENTRY:.*:]] // APPROX-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X]]) // APPROX-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// APPROX-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA12]] +// APPROX-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA4]] // APPROX-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 // APPROX-NEXT: ret double [[TMP2]] // @@ -2550,7 +2550,7 @@ extern "C" __device__ float test_frexpf(float x, int* y) { // NCRDIV-NEXT: [[ENTRY:.*:]] // NCRDIV-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X]]) // NCRDIV-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// NCRDIV-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA13]] +// NCRDIV-NEXT: store i32 [[TMP1]], ptr [[Y]], align 4, !tbaa [[INT_TBAA4]] // NCRDIV-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 // NCRDIV-NEXT: ret double [[TMP2]] // @@ -2559,7 +2559,7 @@ extern "C" __device__ float test_frexpf(float x, int* y) { // AMDGCNSPIRV-NEXT: [[ENTRY:.*:]] // AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = tail call addrspace(4) { double, i32 } @llvm.frexp.f64.i32(double [[X]]) // AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[Y]], align 4, !tbaa [[INT_TBAA13]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[Y]], align 4, !tbaa [[INT_TBAA5]] // AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 // AMDGCNSPIRV-NEXT: ret double [[TMP2]] // @@ -3203,7 +3203,7 @@ extern "C" __device__ double test_j1(double x) { // NCRDIV-NEXT: [[__X0_024_I:%.*]] = phi float [ [[__X1_025_I]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] // NCRDIV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_026_I]], 1 // NCRDIV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float -// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]], !fpmath [[META12]] +// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]], !fpmath [[META14]] // NCRDIV-NEXT: [[MUL8_I:%.*]] = fmul contract float [[__X1_025_I]], [[DIV_I]] // NCRDIV-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_024_I]] // NCRDIV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_026_I]], 1 @@ -4302,14 +4302,14 @@ extern "C" __device__ double test_modf(double x, double* y) { // DEFAULT-LABEL: define dso_local float @test_nanf( // DEFAULT-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR2]] { // 
DEFAULT-NEXT: [[ENTRY:.*]]: -// DEFAULT-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA8]] // DEFAULT-NEXT: switch i8 [[TMP0]], label %[[WHILE_BODY_I25_I_I:.*]] [ // DEFAULT-NEXT: i8 48, label %[[IF_THEN_I_I:.*]] // DEFAULT-NEXT: i8 0, label %[[_ZL4NANFPKC_EXIT:.*]] // DEFAULT-NEXT: ] // DEFAULT: [[IF_THEN_I_I]]: // DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 -// DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // DEFAULT-NEXT: switch i8 [[TMP1]], label %[[WHILE_BODY_I14_I_I:.*]] [ // DEFAULT-NEXT: i8 88, label %[[WHILE_BODY_I_I_I_PREHEADER:.*]] // DEFAULT-NEXT: i8 120, label %[[WHILE_BODY_I_I_I_PREHEADER]] @@ -4353,9 +4353,9 @@ extern "C" __device__ double test_modf(double x, double* y) { // DEFAULT: [[IF_END31_I_I_I]]: // DEFAULT-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ [[ADD16_I_I_I]], %[[IF_THEN11_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_THEN23_I_I_I]] ] // DEFAULT-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_049_I_I_I]], i64 1 -// DEFAULT-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// DEFAULT-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // DEFAULT-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP11]] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP13]] // DEFAULT: [[WHILE_BODY_I14_I_I]]: // DEFAULT-NEXT: [[TMP7:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_THEN_I17_I_I:.*]] ], [ [[TMP1]], %[[IF_THEN_I_I]] ] // DEFAULT-NEXT: [[__R_014_I_I_I:%.*]] = phi i64 [ [[SUB_I21_I_I:%.*]], %[[IF_THEN_I17_I_I]] ], [ 0, %[[IF_THEN_I_I]] ] @@ -4369,9 +4369,9 @@ extern "C" __device__ double test_modf(double x, double* y) { // DEFAULT-NEXT: [[ADD_I20_I_I:%.*]] = add nsw i64 [[CONV5_I19_I_I]], -48 // DEFAULT-NEXT: [[SUB_I21_I_I]] = or disjoint i64 [[ADD_I20_I_I]], [[MUL_I18_I_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I22_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I_I_I]], i64 1 -// DEFAULT-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// DEFAULT-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // DEFAULT-NEXT: [[CMP_NOT_I23_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I23_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I14_I_I]], !llvm.loop [[LOOP7]] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I23_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I14_I_I]], !llvm.loop [[LOOP9]] // DEFAULT: [[WHILE_BODY_I25_I_I]]: // DEFAULT-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I30_I_I:.*]] ], [ [[TMP0]], %[[ENTRY]] ] // DEFAULT-NEXT: [[__R_014_I26_I_I:%.*]] = phi i64 [ [[SUB_I34_I_I:%.*]], %[[IF_THEN_I30_I_I]] ], [ 0, %[[ENTRY]] ] @@ -4385,9 +4385,9 @@ extern "C" __device__ double test_modf(double x, double* y) { // DEFAULT-NEXT: [[ADD_I33_I_I:%.*]] = add nsw i64 [[CONV5_I32_I_I]], -48 // DEFAULT-NEXT: [[SUB_I34_I_I]] = add i64 [[ADD_I33_I_I]], [[MUL_I31_I_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I35_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I27_I_I]], i64 1 -// DEFAULT-NEXT: 
[[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// DEFAULT-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // DEFAULT-NEXT: [[CMP_NOT_I36_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I36_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I25_I_I]], !llvm.loop [[LOOP10]] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I36_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I25_I_I]], !llvm.loop [[LOOP12]] // DEFAULT: [[_ZL4NANFPKC_EXIT]]: // DEFAULT-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[IF_THEN_I_I]] ], [ 0, %[[ENTRY]] ], [ [[SUB_I34_I_I]], %[[IF_THEN_I30_I_I]] ], [ 0, %[[WHILE_BODY_I25_I_I]] ], [ [[SUB_I21_I_I]], %[[IF_THEN_I17_I_I]] ], [ 0, %[[WHILE_BODY_I14_I_I]] ], [ [[__R_1_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ] // DEFAULT-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 @@ -4404,14 +4404,14 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX-LABEL: define dso_local float @test_nanf( // APPROX-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR2]] { // APPROX-NEXT: [[ENTRY:.*]]: -// APPROX-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] +// APPROX-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA8]] // APPROX-NEXT: switch i8 [[TMP0]], label %[[WHILE_BODY_I25_I_I:.*]] [ // APPROX-NEXT: i8 48, label %[[IF_THEN_I_I:.*]] // APPROX-NEXT: i8 0, label %[[_ZL4NANFPKC_EXIT:.*]] // APPROX-NEXT: ] // APPROX: [[IF_THEN_I_I]]: // APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 -// APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // APPROX-NEXT: switch i8 [[TMP1]], label %[[WHILE_BODY_I14_I_I:.*]] [ // APPROX-NEXT: i8 88, label %[[WHILE_BODY_I_I_I_PREHEADER:.*]] // APPROX-NEXT: i8 120, label %[[WHILE_BODY_I_I_I_PREHEADER]] @@ -4455,9 +4455,9 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX: [[IF_END31_I_I_I]]: // APPROX-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ [[ADD16_I_I_I]], %[[IF_THEN11_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_THEN23_I_I_I]] ] // APPROX-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_049_I_I_I]], i64 1 -// APPROX-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// APPROX-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // APPROX-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP11]] +// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP13]] // APPROX: [[WHILE_BODY_I14_I_I]]: // APPROX-NEXT: [[TMP7:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_THEN_I17_I_I:.*]] ], [ [[TMP1]], %[[IF_THEN_I_I]] ] // APPROX-NEXT: [[__R_014_I_I_I:%.*]] = phi i64 [ [[SUB_I21_I_I:%.*]], %[[IF_THEN_I17_I_I]] ], [ 0, %[[IF_THEN_I_I]] ] @@ -4471,9 +4471,9 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX-NEXT: [[ADD_I20_I_I:%.*]] = add nsw i64 [[CONV5_I19_I_I]], -48 // APPROX-NEXT: [[SUB_I21_I_I]] = or disjoint i64 [[ADD_I20_I_I]], [[MUL_I18_I_I]] // APPROX-NEXT: [[INCDEC_PTR_I22_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I_I_I]], i64 1 -// APPROX-NEXT: 
[[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// APPROX-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // APPROX-NEXT: [[CMP_NOT_I23_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I23_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I14_I_I]], !llvm.loop [[LOOP7]] +// APPROX-NEXT: br i1 [[CMP_NOT_I23_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I14_I_I]], !llvm.loop [[LOOP9]] // APPROX: [[WHILE_BODY_I25_I_I]]: // APPROX-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I30_I_I:.*]] ], [ [[TMP0]], %[[ENTRY]] ] // APPROX-NEXT: [[__R_014_I26_I_I:%.*]] = phi i64 [ [[SUB_I34_I_I:%.*]], %[[IF_THEN_I30_I_I]] ], [ 0, %[[ENTRY]] ] @@ -4487,9 +4487,9 @@ extern "C" __device__ double test_modf(double x, double* y) { // APPROX-NEXT: [[ADD_I33_I_I:%.*]] = add nsw i64 [[CONV5_I32_I_I]], -48 // APPROX-NEXT: [[SUB_I34_I_I]] = add i64 [[ADD_I33_I_I]], [[MUL_I31_I_I]] // APPROX-NEXT: [[INCDEC_PTR_I35_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I27_I_I]], i64 1 -// APPROX-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// APPROX-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // APPROX-NEXT: [[CMP_NOT_I36_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I36_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I25_I_I]], !llvm.loop [[LOOP10]] +// APPROX-NEXT: br i1 [[CMP_NOT_I36_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I25_I_I]], !llvm.loop [[LOOP12]] // APPROX: [[_ZL4NANFPKC_EXIT]]: // APPROX-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[IF_THEN_I_I]] ], [ 0, %[[ENTRY]] ], [ [[SUB_I34_I_I]], %[[IF_THEN_I30_I_I]] ], [ 0, %[[WHILE_BODY_I25_I_I]] ], [ [[SUB_I21_I_I]], %[[IF_THEN_I17_I_I]] ], [ 0, %[[WHILE_BODY_I14_I_I]] ], [ [[__R_1_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ] // APPROX-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 @@ -4501,14 +4501,14 @@ extern "C" __device__ double test_modf(double x, double* y) { // NCRDIV-LABEL: define dso_local float @test_nanf( // NCRDIV-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR2]] { // NCRDIV-NEXT: [[ENTRY:.*]]: -// NCRDIV-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA8]] // NCRDIV-NEXT: switch i8 [[TMP0]], label %[[WHILE_BODY_I25_I_I:.*]] [ // NCRDIV-NEXT: i8 48, label %[[IF_THEN_I_I:.*]] // NCRDIV-NEXT: i8 0, label %[[_ZL4NANFPKC_EXIT:.*]] // NCRDIV-NEXT: ] // NCRDIV: [[IF_THEN_I_I]]: // NCRDIV-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 -// NCRDIV-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// NCRDIV-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // NCRDIV-NEXT: switch i8 [[TMP1]], label %[[WHILE_BODY_I14_I_I:.*]] [ // NCRDIV-NEXT: i8 88, label %[[WHILE_BODY_I_I_I_PREHEADER:.*]] // NCRDIV-NEXT: i8 120, label %[[WHILE_BODY_I_I_I_PREHEADER]] @@ -4552,9 +4552,9 @@ extern "C" __device__ double test_modf(double x, double* y) { // NCRDIV: [[IF_END31_I_I_I]]: // NCRDIV-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ [[ADD16_I_I_I]], %[[IF_THEN11_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_THEN23_I_I_I]] ] // NCRDIV-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_049_I_I_I]], i64 1 -// NCRDIV-NEXT: [[TMP6]] 
= load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// NCRDIV-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // NCRDIV-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP11]] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP13]] // NCRDIV: [[WHILE_BODY_I14_I_I]]: // NCRDIV-NEXT: [[TMP7:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_THEN_I17_I_I:.*]] ], [ [[TMP1]], %[[IF_THEN_I_I]] ] // NCRDIV-NEXT: [[__R_014_I_I_I:%.*]] = phi i64 [ [[SUB_I21_I_I:%.*]], %[[IF_THEN_I17_I_I]] ], [ 0, %[[IF_THEN_I_I]] ] @@ -4568,9 +4568,9 @@ extern "C" __device__ double test_modf(double x, double* y) { // NCRDIV-NEXT: [[ADD_I20_I_I:%.*]] = add nsw i64 [[CONV5_I19_I_I]], -48 // NCRDIV-NEXT: [[SUB_I21_I_I]] = or disjoint i64 [[ADD_I20_I_I]], [[MUL_I18_I_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I22_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I_I_I]], i64 1 -// NCRDIV-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// NCRDIV-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // NCRDIV-NEXT: [[CMP_NOT_I23_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I23_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I14_I_I]], !llvm.loop [[LOOP7]] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I23_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I14_I_I]], !llvm.loop [[LOOP9]] // NCRDIV: [[WHILE_BODY_I25_I_I]]: // NCRDIV-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I30_I_I:.*]] ], [ [[TMP0]], %[[ENTRY]] ] // NCRDIV-NEXT: [[__R_014_I26_I_I:%.*]] = phi i64 [ [[SUB_I34_I_I:%.*]], %[[IF_THEN_I30_I_I]] ], [ 0, %[[ENTRY]] ] @@ -4584,9 +4584,9 @@ extern "C" __device__ double test_modf(double x, double* y) { // NCRDIV-NEXT: [[ADD_I33_I_I:%.*]] = add nsw i64 [[CONV5_I32_I_I]], -48 // NCRDIV-NEXT: [[SUB_I34_I_I]] = add i64 [[ADD_I33_I_I]], [[MUL_I31_I_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I35_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I27_I_I]], i64 1 -// NCRDIV-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// NCRDIV-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // NCRDIV-NEXT: [[CMP_NOT_I36_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I36_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I25_I_I]], !llvm.loop [[LOOP10]] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I36_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I25_I_I]], !llvm.loop [[LOOP12]] // NCRDIV: [[_ZL4NANFPKC_EXIT]]: // NCRDIV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[IF_THEN_I_I]] ], [ 0, %[[ENTRY]] ], [ [[SUB_I34_I_I]], %[[IF_THEN_I30_I_I]] ], [ 0, %[[WHILE_BODY_I25_I_I]] ], [ [[SUB_I21_I_I]], %[[IF_THEN_I17_I_I]] ], [ 0, %[[WHILE_BODY_I14_I_I]] ], [ [[__R_1_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ] // NCRDIV-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 @@ -4598,12 +4598,12 @@ extern "C" __device__ double test_modf(double x, double* y) { // AMDGCNSPIRV-LABEL: define spir_func float @test_nanf( // AMDGCNSPIRV-SAME: ptr addrspace(4) noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] { // AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[TAG]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = 
load i8, ptr addrspace(4) [[TAG]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 // AMDGCNSPIRV-NEXT: br i1 [[CMP_I_I]], label %[[IF_THEN_I_I:.*]], label %[[WHILE_COND_I23_I_I:.*]] // AMDGCNSPIRV: [[IF_THEN_I_I]]: // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TAG]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: switch i8 [[TMP1]], label %[[WHILE_COND_I_I_I:.*]] [ // AMDGCNSPIRV-NEXT: i8 88, label %[[WHILE_BODY_I_I_I_PREHEADER:.*]] // AMDGCNSPIRV-NEXT: i8 120, label %[[WHILE_BODY_I_I_I_PREHEADER]] @@ -4646,13 +4646,13 @@ extern "C" __device__ double test_modf(double x, double* y) { // AMDGCNSPIRV: [[IF_END31_I_I_I]]: // AMDGCNSPIRV-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ [[ADD16_I_I_I]], %[[IF_THEN11_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_THEN23_I_I_I]] ] // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_049_I_I_I]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP6]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP6]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP12]] +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP14]] // AMDGCNSPIRV: [[WHILE_COND_I_I_I]]: // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I_I_I:%.*]], %[[WHILE_BODY_I15_I_I:.*]] ], [ [[INCDEC_PTR_I_I]], %[[IF_THEN_I_I]] ] // AMDGCNSPIRV-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I21_I_I:%.*]], %[[WHILE_BODY_I15_I_I]] ], [ 0, %[[IF_THEN_I_I]] ] -// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I14_I_I:%.*]] = icmp eq i8 [[TMP7]], 0 // AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I14_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I15_I_I]] // AMDGCNSPIRV: [[WHILE_BODY_I15_I_I]]: @@ -4665,11 +4665,11 @@ extern "C" __device__ double test_modf(double x, double* y) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I_I_I:%.*]] = zext i1 [[OR_COND_I16_I_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], i64 [[__TAGP_ADDR_1_IDX_I_I_I]] // AMDGCNSPIRV-NEXT: [[__R_1_I21_I_I]] = select i1 [[OR_COND_I16_I_I]], i64 [[SUB_I20_I_I]], i64 [[__R_0_I_I_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I16_I_I]], label %[[WHILE_COND_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP8]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I16_I_I]], label %[[WHILE_COND_I_I_I]], label %[[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP10]] // AMDGCNSPIRV: [[WHILE_COND_I23_I_I]]: // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I24_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I34_I_I:%.*]], %[[WHILE_BODY_I27_I_I:.*]] ], [ [[TAG]], %[[ENTRY]] ] // AMDGCNSPIRV-NEXT: [[__R_0_I25_I_I:%.*]] = phi i64 [ [[__R_1_I35_I_I:%.*]], 
%[[WHILE_BODY_I27_I_I]] ], [ 0, %[[ENTRY]] ] -// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I24_I_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I24_I_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I26_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 // AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I26_I_I]], label %[[_ZL4NANFPKC_EXIT]], label %[[WHILE_BODY_I27_I_I]] // AMDGCNSPIRV: [[WHILE_BODY_I27_I_I]]: @@ -4682,7 +4682,7 @@ extern "C" __device__ double test_modf(double x, double* y) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I33_I_I:%.*]] = zext i1 [[OR_COND_I28_I_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I34_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I24_I_I]], i64 [[__TAGP_ADDR_1_IDX_I33_I_I]] // AMDGCNSPIRV-NEXT: [[__R_1_I35_I_I]] = select i1 [[OR_COND_I28_I_I]], i64 [[SUB_I32_I_I]], i64 [[__R_0_I25_I_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I28_I_I]], label %[[WHILE_COND_I23_I_I]], label %[[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP11]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I28_I_I]], label %[[WHILE_COND_I23_I_I]], label %[[_ZL4NANFPKC_EXIT]], !llvm.loop [[LOOP13]] // AMDGCNSPIRV: [[_ZL4NANFPKC_EXIT]]: // AMDGCNSPIRV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[WHILE_BODY_I15_I_I]] ], [ [[__R_0_I_I_I]], %[[WHILE_COND_I_I_I]] ], [ [[__R_1_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ], [ 0, %[[WHILE_BODY_I27_I_I]] ], [ [[__R_0_I25_I_I]], %[[WHILE_COND_I23_I_I]] ] // AMDGCNSPIRV-NEXT: [[CONV_I:%.*]] = trunc i64 [[RETVAL_0_I_I]] to i32 @@ -4698,14 +4698,14 @@ extern "C" __device__ float test_nanf(const char *tag) { // DEFAULT-LABEL: define dso_local double @test_nan( // DEFAULT-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR2]] { // DEFAULT-NEXT: [[ENTRY:.*]]: -// DEFAULT-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA8]] // DEFAULT-NEXT: switch i8 [[TMP0]], label %[[WHILE_BODY_I25_I_I:.*]] [ // DEFAULT-NEXT: i8 48, label %[[IF_THEN_I_I:.*]] // DEFAULT-NEXT: i8 0, label %[[_ZL3NANPKC_EXIT:.*]] // DEFAULT-NEXT: ] // DEFAULT: [[IF_THEN_I_I]]: // DEFAULT-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 -// DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// DEFAULT-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // DEFAULT-NEXT: switch i8 [[TMP1]], label %[[WHILE_BODY_I14_I_I:.*]] [ // DEFAULT-NEXT: i8 88, label %[[WHILE_BODY_I_I_I_PREHEADER:.*]] // DEFAULT-NEXT: i8 120, label %[[WHILE_BODY_I_I_I_PREHEADER]] @@ -4749,9 +4749,9 @@ extern "C" __device__ float test_nanf(const char *tag) { // DEFAULT: [[IF_END31_I_I_I]]: // DEFAULT-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ [[ADD16_I_I_I]], %[[IF_THEN11_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_THEN23_I_I_I]] ] // DEFAULT-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_049_I_I_I]], i64 1 -// DEFAULT-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// DEFAULT-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // DEFAULT-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP11]] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I_I_I]], label 
%[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP13]] // DEFAULT: [[WHILE_BODY_I14_I_I]]: // DEFAULT-NEXT: [[TMP7:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_THEN_I17_I_I:.*]] ], [ [[TMP1]], %[[IF_THEN_I_I]] ] // DEFAULT-NEXT: [[__R_014_I_I_I:%.*]] = phi i64 [ [[SUB_I21_I_I:%.*]], %[[IF_THEN_I17_I_I]] ], [ 0, %[[IF_THEN_I_I]] ] @@ -4765,9 +4765,9 @@ extern "C" __device__ float test_nanf(const char *tag) { // DEFAULT-NEXT: [[ADD_I20_I_I:%.*]] = add nsw i64 [[CONV5_I19_I_I]], -48 // DEFAULT-NEXT: [[SUB_I21_I_I]] = or disjoint i64 [[ADD_I20_I_I]], [[MUL_I18_I_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I22_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I_I_I]], i64 1 -// DEFAULT-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// DEFAULT-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // DEFAULT-NEXT: [[CMP_NOT_I23_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I23_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I14_I_I]], !llvm.loop [[LOOP7]] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I23_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I14_I_I]], !llvm.loop [[LOOP9]] // DEFAULT: [[WHILE_BODY_I25_I_I]]: // DEFAULT-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I30_I_I:.*]] ], [ [[TMP0]], %[[ENTRY]] ] // DEFAULT-NEXT: [[__R_014_I26_I_I:%.*]] = phi i64 [ [[SUB_I34_I_I:%.*]], %[[IF_THEN_I30_I_I]] ], [ 0, %[[ENTRY]] ] @@ -4781,9 +4781,9 @@ extern "C" __device__ float test_nanf(const char *tag) { // DEFAULT-NEXT: [[ADD_I33_I_I:%.*]] = add nsw i64 [[CONV5_I32_I_I]], -48 // DEFAULT-NEXT: [[SUB_I34_I_I]] = add i64 [[ADD_I33_I_I]], [[MUL_I31_I_I]] // DEFAULT-NEXT: [[INCDEC_PTR_I35_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I27_I_I]], i64 1 -// DEFAULT-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// DEFAULT-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // DEFAULT-NEXT: [[CMP_NOT_I36_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// DEFAULT-NEXT: br i1 [[CMP_NOT_I36_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I25_I_I]], !llvm.loop [[LOOP10]] +// DEFAULT-NEXT: br i1 [[CMP_NOT_I36_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I25_I_I]], !llvm.loop [[LOOP12]] // DEFAULT: [[_ZL3NANPKC_EXIT]]: // DEFAULT-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[IF_THEN_I_I]] ], [ 0, %[[ENTRY]] ], [ [[SUB_I34_I_I]], %[[IF_THEN_I30_I_I]] ], [ 0, %[[WHILE_BODY_I25_I_I]] ], [ [[SUB_I21_I_I]], %[[IF_THEN_I17_I_I]] ], [ 0, %[[WHILE_BODY_I14_I_I]] ], [ [[__R_1_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ] // DEFAULT-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 @@ -4799,14 +4799,14 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX-LABEL: define dso_local double @test_nan( // APPROX-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR2]] { // APPROX-NEXT: [[ENTRY:.*]]: -// APPROX-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] +// APPROX-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA8]] // APPROX-NEXT: switch i8 [[TMP0]], label %[[WHILE_BODY_I25_I_I:.*]] [ // APPROX-NEXT: i8 48, label %[[IF_THEN_I_I:.*]] // APPROX-NEXT: i8 0, label %[[_ZL3NANPKC_EXIT:.*]] // APPROX-NEXT: ] // APPROX: [[IF_THEN_I_I]]: // APPROX-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 -// APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa 
[[CHAR_TBAA4]] +// APPROX-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // APPROX-NEXT: switch i8 [[TMP1]], label %[[WHILE_BODY_I14_I_I:.*]] [ // APPROX-NEXT: i8 88, label %[[WHILE_BODY_I_I_I_PREHEADER:.*]] // APPROX-NEXT: i8 120, label %[[WHILE_BODY_I_I_I_PREHEADER]] @@ -4850,9 +4850,9 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX: [[IF_END31_I_I_I]]: // APPROX-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ [[ADD16_I_I_I]], %[[IF_THEN11_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_THEN23_I_I_I]] ] // APPROX-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_049_I_I_I]], i64 1 -// APPROX-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// APPROX-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // APPROX-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP11]] +// APPROX-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP13]] // APPROX: [[WHILE_BODY_I14_I_I]]: // APPROX-NEXT: [[TMP7:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_THEN_I17_I_I:.*]] ], [ [[TMP1]], %[[IF_THEN_I_I]] ] // APPROX-NEXT: [[__R_014_I_I_I:%.*]] = phi i64 [ [[SUB_I21_I_I:%.*]], %[[IF_THEN_I17_I_I]] ], [ 0, %[[IF_THEN_I_I]] ] @@ -4866,9 +4866,9 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX-NEXT: [[ADD_I20_I_I:%.*]] = add nsw i64 [[CONV5_I19_I_I]], -48 // APPROX-NEXT: [[SUB_I21_I_I]] = or disjoint i64 [[ADD_I20_I_I]], [[MUL_I18_I_I]] // APPROX-NEXT: [[INCDEC_PTR_I22_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I_I_I]], i64 1 -// APPROX-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// APPROX-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // APPROX-NEXT: [[CMP_NOT_I23_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I23_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I14_I_I]], !llvm.loop [[LOOP7]] +// APPROX-NEXT: br i1 [[CMP_NOT_I23_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I14_I_I]], !llvm.loop [[LOOP9]] // APPROX: [[WHILE_BODY_I25_I_I]]: // APPROX-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I30_I_I:.*]] ], [ [[TMP0]], %[[ENTRY]] ] // APPROX-NEXT: [[__R_014_I26_I_I:%.*]] = phi i64 [ [[SUB_I34_I_I:%.*]], %[[IF_THEN_I30_I_I]] ], [ 0, %[[ENTRY]] ] @@ -4882,9 +4882,9 @@ extern "C" __device__ float test_nanf(const char *tag) { // APPROX-NEXT: [[ADD_I33_I_I:%.*]] = add nsw i64 [[CONV5_I32_I_I]], -48 // APPROX-NEXT: [[SUB_I34_I_I]] = add i64 [[ADD_I33_I_I]], [[MUL_I31_I_I]] // APPROX-NEXT: [[INCDEC_PTR_I35_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I27_I_I]], i64 1 -// APPROX-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// APPROX-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // APPROX-NEXT: [[CMP_NOT_I36_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// APPROX-NEXT: br i1 [[CMP_NOT_I36_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I25_I_I]], !llvm.loop [[LOOP10]] +// APPROX-NEXT: br i1 [[CMP_NOT_I36_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I25_I_I]], !llvm.loop [[LOOP12]] // APPROX: [[_ZL3NANPKC_EXIT]]: // APPROX-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[IF_THEN_I_I]] ], [ 0, %[[ENTRY]] ], [ [[SUB_I34_I_I]], 
%[[IF_THEN_I30_I_I]] ], [ 0, %[[WHILE_BODY_I25_I_I]] ], [ [[SUB_I21_I_I]], %[[IF_THEN_I17_I_I]] ], [ 0, %[[WHILE_BODY_I14_I_I]] ], [ [[__R_1_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ] // APPROX-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 @@ -4895,14 +4895,14 @@ extern "C" __device__ float test_nanf(const char *tag) { // NCRDIV-LABEL: define dso_local double @test_nan( // NCRDIV-SAME: ptr noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr #[[ATTR2]] { // NCRDIV-NEXT: [[ENTRY:.*]]: -// NCRDIV-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA4]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load i8, ptr [[TAG]], align 1, !tbaa [[CHAR_TBAA8]] // NCRDIV-NEXT: switch i8 [[TMP0]], label %[[WHILE_BODY_I25_I_I:.*]] [ // NCRDIV-NEXT: i8 48, label %[[IF_THEN_I_I:.*]] // NCRDIV-NEXT: i8 0, label %[[_ZL3NANPKC_EXIT:.*]] // NCRDIV-NEXT: ] // NCRDIV: [[IF_THEN_I_I]]: // NCRDIV-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TAG]], i64 1 -// NCRDIV-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// NCRDIV-NEXT: [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // NCRDIV-NEXT: switch i8 [[TMP1]], label %[[WHILE_BODY_I14_I_I:.*]] [ // NCRDIV-NEXT: i8 88, label %[[WHILE_BODY_I_I_I_PREHEADER:.*]] // NCRDIV-NEXT: i8 120, label %[[WHILE_BODY_I_I_I_PREHEADER]] @@ -4946,9 +4946,9 @@ extern "C" __device__ float test_nanf(const char *tag) { // NCRDIV: [[IF_END31_I_I_I]]: // NCRDIV-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ [[ADD16_I_I_I]], %[[IF_THEN11_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_THEN23_I_I_I]] ] // NCRDIV-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_049_I_I_I]], i64 1 -// NCRDIV-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// NCRDIV-NEXT: [[TMP6]] = load i8, ptr [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // NCRDIV-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP11]] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP13]] // NCRDIV: [[WHILE_BODY_I14_I_I]]: // NCRDIV-NEXT: [[TMP7:%.*]] = phi i8 [ [[TMP9:%.*]], %[[IF_THEN_I17_I_I:.*]] ], [ [[TMP1]], %[[IF_THEN_I_I]] ] // NCRDIV-NEXT: [[__R_014_I_I_I:%.*]] = phi i64 [ [[SUB_I21_I_I:%.*]], %[[IF_THEN_I17_I_I]] ], [ 0, %[[IF_THEN_I_I]] ] @@ -4962,9 +4962,9 @@ extern "C" __device__ float test_nanf(const char *tag) { // NCRDIV-NEXT: [[ADD_I20_I_I:%.*]] = add nsw i64 [[CONV5_I19_I_I]], -48 // NCRDIV-NEXT: [[SUB_I21_I_I]] = or disjoint i64 [[ADD_I20_I_I]], [[MUL_I18_I_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I22_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I_I_I]], i64 1 -// NCRDIV-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// NCRDIV-NEXT: [[TMP9]] = load i8, ptr [[INCDEC_PTR_I22_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // NCRDIV-NEXT: [[CMP_NOT_I23_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I23_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I14_I_I]], !llvm.loop [[LOOP7]] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I23_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I14_I_I]], !llvm.loop [[LOOP9]] // NCRDIV: [[WHILE_BODY_I25_I_I]]: // NCRDIV-NEXT: [[TMP10:%.*]] = phi i8 [ [[TMP12:%.*]], %[[IF_THEN_I30_I_I:.*]] ], [ [[TMP0]], %[[ENTRY]] ] // NCRDIV-NEXT: 
[[__R_014_I26_I_I:%.*]] = phi i64 [ [[SUB_I34_I_I:%.*]], %[[IF_THEN_I30_I_I]] ], [ 0, %[[ENTRY]] ] @@ -4978,9 +4978,9 @@ extern "C" __device__ float test_nanf(const char *tag) { // NCRDIV-NEXT: [[ADD_I33_I_I:%.*]] = add nsw i64 [[CONV5_I32_I_I]], -48 // NCRDIV-NEXT: [[SUB_I34_I_I]] = add i64 [[ADD_I33_I_I]], [[MUL_I31_I_I]] // NCRDIV-NEXT: [[INCDEC_PTR_I35_I_I]] = getelementptr inbounds nuw i8, ptr [[__TAGP_ADDR_013_I27_I_I]], i64 1 -// NCRDIV-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I_I]], align 1, !tbaa [[CHAR_TBAA4]] +// NCRDIV-NEXT: [[TMP12]] = load i8, ptr [[INCDEC_PTR_I35_I_I]], align 1, !tbaa [[CHAR_TBAA8]] // NCRDIV-NEXT: [[CMP_NOT_I36_I_I:%.*]] = icmp eq i8 [[TMP12]], 0 -// NCRDIV-NEXT: br i1 [[CMP_NOT_I36_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I25_I_I]], !llvm.loop [[LOOP10]] +// NCRDIV-NEXT: br i1 [[CMP_NOT_I36_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I25_I_I]], !llvm.loop [[LOOP12]] // NCRDIV: [[_ZL3NANPKC_EXIT]]: // NCRDIV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[IF_THEN_I_I]] ], [ 0, %[[ENTRY]] ], [ [[SUB_I34_I_I]], %[[IF_THEN_I30_I_I]] ], [ 0, %[[WHILE_BODY_I25_I_I]] ], [ [[SUB_I21_I_I]], %[[IF_THEN_I17_I_I]] ], [ 0, %[[WHILE_BODY_I14_I_I]] ], [ [[__R_1_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ] // NCRDIV-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 @@ -4991,12 +4991,12 @@ extern "C" __device__ float test_nanf(const char *tag) { // AMDGCNSPIRV-LABEL: define spir_func double @test_nan( // AMDGCNSPIRV-SAME: ptr addrspace(4) noundef readonly captures(none) [[TAG:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] { // AMDGCNSPIRV-NEXT: [[ENTRY:.*]]: -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[TAG]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i8, ptr addrspace(4) [[TAG]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_I_I:%.*]] = icmp eq i8 [[TMP0]], 48 // AMDGCNSPIRV-NEXT: br i1 [[CMP_I_I]], label %[[IF_THEN_I_I:.*]], label %[[WHILE_COND_I23_I_I:.*]] // AMDGCNSPIRV: [[IF_THEN_I_I]]: // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TAG]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: switch i8 [[TMP1]], label %[[WHILE_COND_I_I_I:.*]] [ // AMDGCNSPIRV-NEXT: i8 88, label %[[WHILE_BODY_I_I_I_PREHEADER:.*]] // AMDGCNSPIRV-NEXT: i8 120, label %[[WHILE_BODY_I_I_I_PREHEADER]] @@ -5039,13 +5039,13 @@ extern "C" __device__ float test_nanf(const char *tag) { // AMDGCNSPIRV: [[IF_END31_I_I_I]]: // AMDGCNSPIRV-NEXT: [[__R_1_I_I_I]] = phi i64 [ [[SUB_I_I_I]], %[[IF_THEN_I_I_I]] ], [ [[ADD16_I_I_I]], %[[IF_THEN11_I_I_I]] ], [ [[ADD28_I_I_I]], %[[IF_THEN23_I_I_I]] ] // AMDGCNSPIRV-NEXT: [[INCDEC_PTR_I_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_049_I_I_I]], i64 1 -// AMDGCNSPIRV-NEXT: [[TMP6]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP6]] = load i8, ptr addrspace(4) [[INCDEC_PTR_I_I_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I_I_I:%.*]] = icmp eq i8 [[TMP6]], 0 -// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], !llvm.loop [[LOOP12]] +// AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I_I_I]], 
!llvm.loop [[LOOP14]] // AMDGCNSPIRV: [[WHILE_COND_I_I_I]]: // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I_I_I:%.*]], %[[WHILE_BODY_I15_I_I:.*]] ], [ [[INCDEC_PTR_I_I]], %[[IF_THEN_I_I]] ] // AMDGCNSPIRV-NEXT: [[__R_0_I_I_I:%.*]] = phi i64 [ [[__R_1_I21_I_I:%.*]], %[[WHILE_BODY_I15_I_I]] ], [ 0, %[[IF_THEN_I_I]] ] -// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I14_I_I:%.*]] = icmp eq i8 [[TMP7]], 0 // AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I14_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I15_I_I]] // AMDGCNSPIRV: [[WHILE_BODY_I15_I_I]]: @@ -5058,11 +5058,11 @@ extern "C" __device__ float test_nanf(const char *tag) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I_I_I:%.*]] = zext i1 [[OR_COND_I16_I_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I_I_I]], i64 [[__TAGP_ADDR_1_IDX_I_I_I]] // AMDGCNSPIRV-NEXT: [[__R_1_I21_I_I]] = select i1 [[OR_COND_I16_I_I]], i64 [[SUB_I20_I_I]], i64 [[__R_0_I_I_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I16_I_I]], label %[[WHILE_COND_I_I_I]], label %[[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP8]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I16_I_I]], label %[[WHILE_COND_I_I_I]], label %[[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP10]] // AMDGCNSPIRV: [[WHILE_COND_I23_I_I]]: // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_0_I24_I_I:%.*]] = phi ptr addrspace(4) [ [[__TAGP_ADDR_1_I34_I_I:%.*]], %[[WHILE_BODY_I27_I_I:.*]] ], [ [[TAG]], %[[ENTRY]] ] // AMDGCNSPIRV-NEXT: [[__R_0_I25_I_I:%.*]] = phi i64 [ [[__R_1_I35_I_I:%.*]], %[[WHILE_BODY_I27_I_I]] ], [ 0, %[[ENTRY]] ] -// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I24_I_I]], align 1, !tbaa [[CHAR_TBAA5]] +// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(4) [[__TAGP_ADDR_0_I24_I_I]], align 1, !tbaa [[CHAR_TBAA9]] // AMDGCNSPIRV-NEXT: [[CMP_NOT_I26_I_I:%.*]] = icmp eq i8 [[TMP9]], 0 // AMDGCNSPIRV-NEXT: br i1 [[CMP_NOT_I26_I_I]], label %[[_ZL3NANPKC_EXIT]], label %[[WHILE_BODY_I27_I_I]] // AMDGCNSPIRV: [[WHILE_BODY_I27_I_I]]: @@ -5075,7 +5075,7 @@ extern "C" __device__ float test_nanf(const char *tag) { // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_IDX_I33_I_I:%.*]] = zext i1 [[OR_COND_I28_I_I]] to i64 // AMDGCNSPIRV-NEXT: [[__TAGP_ADDR_1_I34_I_I]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[__TAGP_ADDR_0_I24_I_I]], i64 [[__TAGP_ADDR_1_IDX_I33_I_I]] // AMDGCNSPIRV-NEXT: [[__R_1_I35_I_I]] = select i1 [[OR_COND_I28_I_I]], i64 [[SUB_I32_I_I]], i64 [[__R_0_I25_I_I]] -// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I28_I_I]], label %[[WHILE_COND_I23_I_I]], label %[[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP11]] +// AMDGCNSPIRV-NEXT: br i1 [[OR_COND_I28_I_I]], label %[[WHILE_COND_I23_I_I]], label %[[_ZL3NANPKC_EXIT]], !llvm.loop [[LOOP13]] // AMDGCNSPIRV: [[_ZL3NANPKC_EXIT]]: // AMDGCNSPIRV-NEXT: [[RETVAL_0_I_I:%.*]] = phi i64 [ 0, %[[WHILE_BODY_I15_I_I]] ], [ [[__R_0_I_I_I]], %[[WHILE_COND_I_I_I]] ], [ [[__R_1_I_I_I]], %[[IF_END31_I_I_I]] ], [ 0, %[[IF_ELSE17_I_I_I]] ], [ 0, %[[WHILE_BODY_I27_I_I]] ], [ [[__R_0_I25_I_I]], %[[WHILE_COND_I23_I_I]] ] // AMDGCNSPIRV-NEXT: [[BF_VALUE_I:%.*]] = and i64 [[RETVAL_0_I_I]], 2251799813685247 @@ -6127,8 +6127,8 @@ extern "C" __device__ double test_remainder(double x, double y) { // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // 
DEFAULT-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X]], float noundef [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] -// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA12]] -// DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA12]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA4]] +// DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA4]] // DEFAULT-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // DEFAULT-NEXT: ret float [[CALL_I]] // @@ -6138,8 +6138,8 @@ extern "C" __device__ double test_remainder(double x, double y) { // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_remquo_f32(float noundef nofpclass(nan inf) [[X]], float noundef nofpclass(nan inf) [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] -// FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA12]] -// FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA12]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA4]] +// FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA4]] // FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // FINITEONLY-NEXT: ret float [[CALL_I]] // @@ -6149,8 +6149,8 @@ extern "C" __device__ double test_remainder(double x, double y) { // APPROX-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // APPROX-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X]], float noundef [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] -// APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA12]] -// APPROX-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA12]] +// APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA4]] +// APPROX-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA4]] // APPROX-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret float [[CALL_I]] // @@ -6160,8 +6160,8 @@ extern "C" __device__ double test_remainder(double x, double y) { // NCRDIV-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // NCRDIV-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // NCRDIV-NEXT: [[CALL_I:%.*]] = call contract noundef float @__ocml_remquo_f32(float noundef [[X]], float noundef [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] -// NCRDIV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA13]] -// NCRDIV-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA13]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA4]] +// NCRDIV-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA4]] // NCRDIV-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // 
NCRDIV-NEXT: ret float [[CALL_I]] // @@ -6172,8 +6172,8 @@ extern "C" __device__ double test_remainder(double x, double y) { // AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] // AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) float @__ocml_remquo_f32(float noundef [[X]], float noundef [[Y]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[INT_TBAA13]] -// AMDGCNSPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(4) [[Z]], align 4, !tbaa [[INT_TBAA13]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[INT_TBAA5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(4) [[Z]], align 4, !tbaa [[INT_TBAA5]] // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] // AMDGCNSPIRV-NEXT: ret float [[CALL_I]] // @@ -6187,8 +6187,8 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) { // DEFAULT-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // DEFAULT-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // DEFAULT-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X]], double noundef [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] -// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA12]] -// DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA12]] +// DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA4]] +// DEFAULT-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA4]] // DEFAULT-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // DEFAULT-NEXT: ret double [[CALL_I]] // @@ -6198,8 +6198,8 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) { // FINITEONLY-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // FINITEONLY-NEXT: [[CALL_I:%.*]] = call nnan ninf contract noundef nofpclass(nan inf) double @__ocml_remquo_f64(double noundef nofpclass(nan inf) [[X]], double noundef nofpclass(nan inf) [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] -// FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA12]] -// FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA12]] +// FINITEONLY-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA4]] +// FINITEONLY-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA4]] // FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // FINITEONLY-NEXT: ret double [[CALL_I]] // @@ -6209,8 +6209,8 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) { // APPROX-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // APPROX-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X]], double noundef [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] -// APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa 
[[INT_TBAA12]] -// APPROX-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA12]] +// APPROX-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA4]] +// APPROX-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA4]] // APPROX-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // APPROX-NEXT: ret double [[CALL_I]] // @@ -6220,8 +6220,8 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) { // NCRDIV-NEXT: [[__TMP_I:%.*]] = alloca i32, align 4, addrspace(5) // NCRDIV-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // NCRDIV-NEXT: [[CALL_I:%.*]] = call contract noundef double @__ocml_remquo_f64(double noundef [[X]], double noundef [[Y]], ptr addrspace(5) noundef [[__TMP_I]]) #[[ATTR14]] -// NCRDIV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA13]] -// NCRDIV-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA13]] +// NCRDIV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[__TMP_I]], align 4, !tbaa [[INT_TBAA4]] +// NCRDIV-NEXT: store i32 [[TMP0]], ptr [[Z]], align 4, !tbaa [[INT_TBAA4]] // NCRDIV-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[__TMP_I]]) #[[ATTR15]] // NCRDIV-NEXT: ret double [[CALL_I]] // @@ -6232,8 +6232,8 @@ extern "C" __device__ float test_remquof(float x, float y, int* z) { // AMDGCNSPIRV-NEXT: [[__TMP_ASCAST_I:%.*]] = addrspacecast ptr [[__TMP_I]] to ptr addrspace(4) // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.start.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] // AMDGCNSPIRV-NEXT: [[CALL_I:%.*]] = call contract spir_func noundef addrspace(4) double @__ocml_remquo_f64(double noundef [[X]], double noundef [[Y]], ptr noundef nonnull [[__TMP_I]]) #[[ATTR14]] -// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[INT_TBAA13]] -// AMDGCNSPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(4) [[Z]], align 4, !tbaa [[INT_TBAA13]] +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[__TMP_ASCAST_I]], align 4, !tbaa [[INT_TBAA5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(4) [[Z]], align 4, !tbaa [[INT_TBAA5]] // AMDGCNSPIRV-NEXT: call addrspace(4) void @llvm.lifetime.end.p0(ptr nonnull [[__TMP_I]]) #[[ATTR15]] // AMDGCNSPIRV-NEXT: ret double [[CALL_I]] // @@ -8104,7 +8104,7 @@ extern "C" __device__ double test_y1(double x) { // NCRDIV-NEXT: [[__X0_024_I:%.*]] = phi float [ [[__X1_025_I]], %[[FOR_BODY_I]] ], [ [[CALL_I21_I]], %[[IF_END4_I]] ] // NCRDIV-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_026_I]], 1 // NCRDIV-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float -// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]], !fpmath [[META12]] +// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]], !fpmath [[META14]] // NCRDIV-NEXT: [[MUL8_I:%.*]] = fmul contract float [[__X1_025_I]], [[DIV_I]] // NCRDIV-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_024_I]] // NCRDIV-NEXT: [[INC_I]] = add nuw nsw i32 [[__I_026_I]], 1 @@ -8493,7 +8493,7 @@ extern "C" __device__ float test___fadd_rn(float x, float y) { // NCRDIV-LABEL: define dso_local noundef float @test___fdividef( // NCRDIV-SAME: float noundef [[X:%.*]], float noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR3]] { // NCRDIV-NEXT: [[ENTRY:.*:]] -// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X]], [[Y]], !fpmath [[META12]] +// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float [[X]], [[Y]], !fpmath [[META14]] // 
NCRDIV-NEXT: ret float [[DIV_I]] // // AMDGCNSPIRV-LABEL: define spir_func noundef float @test___fdividef( @@ -8595,7 +8595,7 @@ extern "C" __device__ float test___fmul_rn(float x, float y) { // NCRDIV-LABEL: define dso_local noundef float @test___frcp_rn( // NCRDIV-SAME: float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR3]] { // NCRDIV-NEXT: [[ENTRY:.*:]] -// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float 1.000000e+00, [[X]], !fpmath [[META12]] +// NCRDIV-NEXT: [[DIV_I:%.*]] = fdiv contract float 1.000000e+00, [[X]], !fpmath [[META14]] // NCRDIV-NEXT: ret float [[DIV_I]] // // AMDGCNSPIRV-LABEL: define spir_func noundef float @test___frcp_rn( @@ -9398,120 +9398,120 @@ extern "C" __device__ int test_int_max(int x, int y) { return max(x, y); } //. -// DEFAULT: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// DEFAULT: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// DEFAULT: [[META6]] = !{!"Simple C++ TBAA"} -// DEFAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]]} -// DEFAULT: [[META8]] = !{!"llvm.loop.mustprogress"} -// DEFAULT: [[META9]] = !{!"llvm.loop.unroll.disable"} -// DEFAULT: [[LOOP10]] = distinct !{[[LOOP10]], [[META8]], [[META9]]} -// DEFAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META8]], [[META9]]} -// DEFAULT: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} -// DEFAULT: [[META13]] = !{!"int", [[META5]], i64 0} -// DEFAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META8]], [[META9]]} -// DEFAULT: [[LOOP15]] = distinct !{[[LOOP15]], [[META8]], [[META9]]} +// DEFAULT: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// DEFAULT: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} +// DEFAULT: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// DEFAULT: [[META7]] = !{!"Simple C++ TBAA"} +// DEFAULT: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} +// DEFAULT: [[LOOP9]] = distinct !{[[LOOP9]], [[META10:![0-9]+]], [[META11:![0-9]+]]} +// DEFAULT: [[META10]] = !{!"llvm.loop.mustprogress"} +// DEFAULT: [[META11]] = !{!"llvm.loop.unroll.disable"} +// DEFAULT: [[LOOP12]] = distinct !{[[LOOP12]], [[META10]], [[META11]]} +// DEFAULT: [[LOOP13]] = distinct !{[[LOOP13]], [[META10]], [[META11]]} +// DEFAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META10]], [[META11]]} +// DEFAULT: [[LOOP15]] = distinct !{[[LOOP15]], [[META10]], [[META11]]} // DEFAULT: [[FLOAT_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} -// DEFAULT: [[META17]] = !{!"float", [[META5]], i64 0} +// DEFAULT: [[META17]] = !{!"float", [[META6]], i64 0} // DEFAULT: [[DOUBLE_TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -// DEFAULT: [[META19]] = !{!"double", [[META5]], i64 0} -// DEFAULT: [[LOOP20]] = distinct !{[[LOOP20]], [[META8]], [[META9]]} -// DEFAULT: [[LOOP21]] = distinct !{[[LOOP21]], [[META8]], [[META9]]} -// DEFAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META8]], [[META9]]} -// DEFAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META8]], [[META9]]} -// DEFAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META8]], [[META9]]} -// DEFAULT: [[LOOP25]] = distinct !{[[LOOP25]], [[META8]], [[META9]]} +// DEFAULT: [[META19]] = !{!"double", [[META6]], i64 0} +// DEFAULT: [[LOOP20]] = distinct !{[[LOOP20]], [[META10]], [[META11]]} +// DEFAULT: [[LOOP21]] = distinct !{[[LOOP21]], [[META10]], [[META11]]} +// DEFAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META10]], [[META11]]} +// DEFAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META10]], [[META11]]} +// DEFAULT: [[LOOP24]] = distinct !{[[LOOP24]], [[META10]], [[META11]]} +// DEFAULT: [[LOOP25]] = distinct 
!{[[LOOP25]], [[META10]], [[META11]]} //. -// FINITEONLY: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// FINITEONLY: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// FINITEONLY: [[META6]] = !{!"Simple C++ TBAA"} -// FINITEONLY: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]]} -// FINITEONLY: [[META8]] = !{!"llvm.loop.mustprogress"} -// FINITEONLY: [[META9]] = !{!"llvm.loop.unroll.disable"} -// FINITEONLY: [[LOOP10]] = distinct !{[[LOOP10]], [[META8]], [[META9]]} -// FINITEONLY: [[LOOP11]] = distinct !{[[LOOP11]], [[META8]], [[META9]]} -// FINITEONLY: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} -// FINITEONLY: [[META13]] = !{!"int", [[META5]], i64 0} -// FINITEONLY: [[LOOP14]] = distinct !{[[LOOP14]], [[META8]], [[META9]]} -// FINITEONLY: [[LOOP15]] = distinct !{[[LOOP15]], [[META8]], [[META9]]} +// FINITEONLY: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// FINITEONLY: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} +// FINITEONLY: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// FINITEONLY: [[META7]] = !{!"Simple C++ TBAA"} +// FINITEONLY: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} +// FINITEONLY: [[LOOP9]] = distinct !{[[LOOP9]], [[META10:![0-9]+]], [[META11:![0-9]+]]} +// FINITEONLY: [[META10]] = !{!"llvm.loop.mustprogress"} +// FINITEONLY: [[META11]] = !{!"llvm.loop.unroll.disable"} +// FINITEONLY: [[LOOP12]] = distinct !{[[LOOP12]], [[META10]], [[META11]]} +// FINITEONLY: [[LOOP13]] = distinct !{[[LOOP13]], [[META10]], [[META11]]} +// FINITEONLY: [[LOOP14]] = distinct !{[[LOOP14]], [[META10]], [[META11]]} +// FINITEONLY: [[LOOP15]] = distinct !{[[LOOP15]], [[META10]], [[META11]]} // FINITEONLY: [[FLOAT_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} -// FINITEONLY: [[META17]] = !{!"float", [[META5]], i64 0} +// FINITEONLY: [[META17]] = !{!"float", [[META6]], i64 0} // FINITEONLY: [[DOUBLE_TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -// FINITEONLY: [[META19]] = !{!"double", [[META5]], i64 0} -// FINITEONLY: [[LOOP20]] = distinct !{[[LOOP20]], [[META8]], [[META9]]} -// FINITEONLY: [[LOOP21]] = distinct !{[[LOOP21]], [[META8]], [[META9]]} -// FINITEONLY: [[LOOP22]] = distinct !{[[LOOP22]], [[META8]], [[META9]]} -// FINITEONLY: [[LOOP23]] = distinct !{[[LOOP23]], [[META8]], [[META9]]} -// FINITEONLY: [[LOOP24]] = distinct !{[[LOOP24]], [[META8]], [[META9]]} -// FINITEONLY: [[LOOP25]] = distinct !{[[LOOP25]], [[META8]], [[META9]]} +// FINITEONLY: [[META19]] = !{!"double", [[META6]], i64 0} +// FINITEONLY: [[LOOP20]] = distinct !{[[LOOP20]], [[META10]], [[META11]]} +// FINITEONLY: [[LOOP21]] = distinct !{[[LOOP21]], [[META10]], [[META11]]} +// FINITEONLY: [[LOOP22]] = distinct !{[[LOOP22]], [[META10]], [[META11]]} +// FINITEONLY: [[LOOP23]] = distinct !{[[LOOP23]], [[META10]], [[META11]]} +// FINITEONLY: [[LOOP24]] = distinct !{[[LOOP24]], [[META10]], [[META11]]} +// FINITEONLY: [[LOOP25]] = distinct !{[[LOOP25]], [[META10]], [[META11]]} //. 
-// APPROX: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// APPROX: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// APPROX: [[META6]] = !{!"Simple C++ TBAA"} -// APPROX: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]]} -// APPROX: [[META8]] = !{!"llvm.loop.mustprogress"} -// APPROX: [[META9]] = !{!"llvm.loop.unroll.disable"} -// APPROX: [[LOOP10]] = distinct !{[[LOOP10]], [[META8]], [[META9]]} -// APPROX: [[LOOP11]] = distinct !{[[LOOP11]], [[META8]], [[META9]]} -// APPROX: [[INT_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} -// APPROX: [[META13]] = !{!"int", [[META5]], i64 0} -// APPROX: [[LOOP14]] = distinct !{[[LOOP14]], [[META8]], [[META9]]} -// APPROX: [[LOOP15]] = distinct !{[[LOOP15]], [[META8]], [[META9]]} +// APPROX: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// APPROX: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} +// APPROX: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// APPROX: [[META7]] = !{!"Simple C++ TBAA"} +// APPROX: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} +// APPROX: [[LOOP9]] = distinct !{[[LOOP9]], [[META10:![0-9]+]], [[META11:![0-9]+]]} +// APPROX: [[META10]] = !{!"llvm.loop.mustprogress"} +// APPROX: [[META11]] = !{!"llvm.loop.unroll.disable"} +// APPROX: [[LOOP12]] = distinct !{[[LOOP12]], [[META10]], [[META11]]} +// APPROX: [[LOOP13]] = distinct !{[[LOOP13]], [[META10]], [[META11]]} +// APPROX: [[LOOP14]] = distinct !{[[LOOP14]], [[META10]], [[META11]]} +// APPROX: [[LOOP15]] = distinct !{[[LOOP15]], [[META10]], [[META11]]} // APPROX: [[FLOAT_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} -// APPROX: [[META17]] = !{!"float", [[META5]], i64 0} +// APPROX: [[META17]] = !{!"float", [[META6]], i64 0} // APPROX: [[DOUBLE_TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -// APPROX: [[META19]] = !{!"double", [[META5]], i64 0} -// APPROX: [[LOOP20]] = distinct !{[[LOOP20]], [[META8]], [[META9]]} -// APPROX: [[LOOP21]] = distinct !{[[LOOP21]], [[META8]], [[META9]]} -// APPROX: [[LOOP22]] = distinct !{[[LOOP22]], [[META8]], [[META9]]} -// APPROX: [[LOOP23]] = distinct !{[[LOOP23]], [[META8]], [[META9]]} -// APPROX: [[LOOP24]] = distinct !{[[LOOP24]], [[META8]], [[META9]]} -// APPROX: [[LOOP25]] = distinct !{[[LOOP25]], [[META8]], [[META9]]} +// APPROX: [[META19]] = !{!"double", [[META6]], i64 0} +// APPROX: [[LOOP20]] = distinct !{[[LOOP20]], [[META10]], [[META11]]} +// APPROX: [[LOOP21]] = distinct !{[[LOOP21]], [[META10]], [[META11]]} +// APPROX: [[LOOP22]] = distinct !{[[LOOP22]], [[META10]], [[META11]]} +// APPROX: [[LOOP23]] = distinct !{[[LOOP23]], [[META10]], [[META11]]} +// APPROX: [[LOOP24]] = distinct !{[[LOOP24]], [[META10]], [[META11]]} +// APPROX: [[LOOP25]] = distinct !{[[LOOP25]], [[META10]], [[META11]]} //. 
-// NCRDIV: [[CHAR_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} -// NCRDIV: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// NCRDIV: [[META6]] = !{!"Simple C++ TBAA"} -// NCRDIV: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]]} -// NCRDIV: [[META8]] = !{!"llvm.loop.mustprogress"} -// NCRDIV: [[META9]] = !{!"llvm.loop.unroll.disable"} -// NCRDIV: [[LOOP10]] = distinct !{[[LOOP10]], [[META8]], [[META9]]} -// NCRDIV: [[LOOP11]] = distinct !{[[LOOP11]], [[META8]], [[META9]]} -// NCRDIV: [[META12]] = !{float 2.500000e+00} -// NCRDIV: [[INT_TBAA13]] = !{[[META14:![0-9]+]], [[META14]], i64 0} -// NCRDIV: [[META14]] = !{!"int", [[META5]], i64 0} -// NCRDIV: [[LOOP15]] = distinct !{[[LOOP15]], [[META8]], [[META9]]} -// NCRDIV: [[LOOP16]] = distinct !{[[LOOP16]], [[META8]], [[META9]]} +// NCRDIV: [[INT_TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +// NCRDIV: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} +// NCRDIV: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} +// NCRDIV: [[META7]] = !{!"Simple C++ TBAA"} +// NCRDIV: [[CHAR_TBAA8]] = !{[[META6]], [[META6]], i64 0} +// NCRDIV: [[LOOP9]] = distinct !{[[LOOP9]], [[META10:![0-9]+]], [[META11:![0-9]+]]} +// NCRDIV: [[META10]] = !{!"llvm.loop.mustprogress"} +// NCRDIV: [[META11]] = !{!"llvm.loop.unroll.disable"} +// NCRDIV: [[LOOP12]] = distinct !{[[LOOP12]], [[META10]], [[META11]]} +// NCRDIV: [[LOOP13]] = distinct !{[[LOOP13]], [[META10]], [[META11]]} +// NCRDIV: [[META14]] = !{float 2.500000e+00} +// NCRDIV: [[LOOP15]] = distinct !{[[LOOP15]], [[META10]], [[META11]]} +// NCRDIV: [[LOOP16]] = distinct !{[[LOOP16]], [[META10]], [[META11]]} // NCRDIV: [[FLOAT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} -// NCRDIV: [[META18]] = !{!"float", [[META5]], i64 0} +// NCRDIV: [[META18]] = !{!"float", [[META6]], i64 0} // NCRDIV: [[DOUBLE_TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} -// NCRDIV: [[META20]] = !{!"double", [[META5]], i64 0} -// NCRDIV: [[LOOP21]] = distinct !{[[LOOP21]], [[META8]], [[META9]]} -// NCRDIV: [[LOOP22]] = distinct !{[[LOOP22]], [[META8]], [[META9]]} -// NCRDIV: [[LOOP23]] = distinct !{[[LOOP23]], [[META8]], [[META9]]} -// NCRDIV: [[LOOP24]] = distinct !{[[LOOP24]], [[META8]], [[META9]]} +// NCRDIV: [[META20]] = !{!"double", [[META6]], i64 0} +// NCRDIV: [[LOOP21]] = distinct !{[[LOOP21]], [[META10]], [[META11]]} +// NCRDIV: [[LOOP22]] = distinct !{[[LOOP22]], [[META10]], [[META11]]} +// NCRDIV: [[LOOP23]] = distinct !{[[LOOP23]], [[META10]], [[META11]]} +// NCRDIV: [[LOOP24]] = distinct !{[[LOOP24]], [[META10]], [[META11]]} // NCRDIV: [[META25]] = !{float 3.000000e+00} -// NCRDIV: [[LOOP26]] = distinct !{[[LOOP26]], [[META8]], [[META9]]} -// NCRDIV: [[LOOP27]] = distinct !{[[LOOP27]], [[META8]], [[META9]]} +// NCRDIV: [[LOOP26]] = distinct !{[[LOOP26]], [[META10]], [[META11]]} +// NCRDIV: [[LOOP27]] = distinct !{[[LOOP27]], [[META10]], [[META11]]} //. 
-// AMDGCNSPIRV: [[CHAR_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} -// AMDGCNSPIRV: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} -// AMDGCNSPIRV: [[META7]] = !{!"Simple C++ TBAA"} -// AMDGCNSPIRV: [[LOOP8]] = distinct !{[[LOOP8]], [[META9:![0-9]+]], [[META10:![0-9]+]]} -// AMDGCNSPIRV: [[META9]] = !{!"llvm.loop.mustprogress"} -// AMDGCNSPIRV: [[META10]] = !{!"llvm.loop.unroll.disable"} -// AMDGCNSPIRV: [[LOOP11]] = distinct !{[[LOOP11]], [[META9]], [[META10]]} -// AMDGCNSPIRV: [[LOOP12]] = distinct !{[[LOOP12]], [[META9]], [[META10]]} -// AMDGCNSPIRV: [[INT_TBAA13]] = !{[[META14:![0-9]+]], [[META14]], i64 0} -// AMDGCNSPIRV: [[META14]] = !{!"int", [[META6]], i64 0} -// AMDGCNSPIRV: [[LOOP15]] = distinct !{[[LOOP15]], [[META9]], [[META10]]} -// AMDGCNSPIRV: [[LOOP16]] = distinct !{[[LOOP16]], [[META9]], [[META10]]} +// AMDGCNSPIRV: [[INT_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +// AMDGCNSPIRV: [[META6]] = !{!"int", [[META7:![0-9]+]], i64 0} +// AMDGCNSPIRV: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +// AMDGCNSPIRV: [[META8]] = !{!"Simple C++ TBAA"} +// AMDGCNSPIRV: [[CHAR_TBAA9]] = !{[[META7]], [[META7]], i64 0} +// AMDGCNSPIRV: [[LOOP10]] = distinct !{[[LOOP10]], [[META11:![0-9]+]], [[META12:![0-9]+]]} +// AMDGCNSPIRV: [[META11]] = !{!"llvm.loop.mustprogress"} +// AMDGCNSPIRV: [[META12]] = !{!"llvm.loop.unroll.disable"} +// AMDGCNSPIRV: [[LOOP13]] = distinct !{[[LOOP13]], [[META11]], [[META12]]} +// AMDGCNSPIRV: [[LOOP14]] = distinct !{[[LOOP14]], [[META11]], [[META12]]} +// AMDGCNSPIRV: [[LOOP15]] = distinct !{[[LOOP15]], [[META11]], [[META12]]} +// AMDGCNSPIRV: [[LOOP16]] = distinct !{[[LOOP16]], [[META11]], [[META12]]} // AMDGCNSPIRV: [[FLOAT_TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} -// AMDGCNSPIRV: [[META18]] = !{!"float", [[META6]], i64 0} +// AMDGCNSPIRV: [[META18]] = !{!"float", [[META7]], i64 0} // AMDGCNSPIRV: [[DOUBLE_TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} -// AMDGCNSPIRV: [[META20]] = !{!"double", [[META6]], i64 0} -// AMDGCNSPIRV: [[LOOP21]] = distinct !{[[LOOP21]], [[META9]], [[META10]]} -// AMDGCNSPIRV: [[LOOP22]] = distinct !{[[LOOP22]], [[META9]], [[META10]]} -// AMDGCNSPIRV: [[LOOP23]] = distinct !{[[LOOP23]], [[META9]], [[META10]]} -// AMDGCNSPIRV: [[LOOP24]] = distinct !{[[LOOP24]], [[META9]], [[META10]]} -// AMDGCNSPIRV: [[LOOP25]] = distinct !{[[LOOP25]], [[META9]], [[META10]]} -// AMDGCNSPIRV: [[LOOP26]] = distinct !{[[LOOP26]], [[META9]], [[META10]]} +// AMDGCNSPIRV: [[META20]] = !{!"double", [[META7]], i64 0} +// AMDGCNSPIRV: [[LOOP21]] = distinct !{[[LOOP21]], [[META11]], [[META12]]} +// AMDGCNSPIRV: [[LOOP22]] = distinct !{[[LOOP22]], [[META11]], [[META12]]} +// AMDGCNSPIRV: [[LOOP23]] = distinct !{[[LOOP23]], [[META11]], [[META12]]} +// AMDGCNSPIRV: [[LOOP24]] = distinct !{[[LOOP24]], [[META11]], [[META12]]} +// AMDGCNSPIRV: [[LOOP25]] = distinct !{[[LOOP25]], [[META11]], [[META12]]} +// AMDGCNSPIRV: [[LOOP26]] = distinct !{[[LOOP26]], [[META11]], [[META12]]} //. 
diff --git a/clang/test/Headers/wasm.c b/clang/test/Headers/wasm.c
index 13b474a9276ba..d5c57e2844094 100644
--- a/clang/test/Headers/wasm.c
+++ b/clang/test/Headers/wasm.c
@@ -12,7 +12,7 @@
// CHECK-LABEL: define hidden <4 x i32> @test_v128_load(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6:![0-9]+]]
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_v128_load(const void *mem) {
@@ -22,7 +22,7 @@ v128_t test_v128_load(const void *mem) {
// CHECK-LABEL: define hidden <4 x i32> @test_v128_load8_splat(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0
// CHECK-NEXT: [[VECINIT16_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VECINIT16_I]] to <4 x i32>
@@ -35,7 +35,7 @@ v128_t test_v128_load8_splat(const void *mem) {
// CHECK-LABEL: define hidden <4 x i32> @test_v128_load16_splat(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
// CHECK-NEXT: [[VECINIT8_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT8_I]] to <4 x i32>
@@ -48,7 +48,7 @@ v128_t test_v128_load16_splat(const void *mem) {
// CHECK-LABEL: define hidden <4 x i32> @test_v128_load32_splat(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
// CHECK-NEXT: [[VECINIT4_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: ret <4 x i32> [[VECINIT4_I]]
@@ -60,7 +60,7 @@ v128_t test_v128_load32_splat(const void *mem) {
// CHECK-LABEL: define hidden <4 x i32> @test_v128_load64_splat(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <2 x i64> [[VECINIT_I]], <2 x i64> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT2_I]] to <4 x i32>
@@ -73,7 +73,7 @@ v128_t test_v128_load64_splat(const void *mem) {
// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_load8x8(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[CONV_I:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[CONV_I]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
@@ -85,7 +85,7 @@ v128_t test_i16x8_load8x8(const void *mem) {
// CHECK-LABEL: define hidden <4 x i32> @test_u16x8_load8x8(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[CONV_I:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[CONV_I]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
@@ -97,7 +97,7 @@ v128_t test_u16x8_load8x8(const void *mem) {
// CHECK-LABEL: define hidden range(i32 -32768, 32768) <4 x i32> @test_i32x4_load16x4(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[CONV_I:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[CONV_I]]
//
@@ -108,7 +108,7 @@ v128_t test_i32x4_load16x4(const void *mem) {
// CHECK-LABEL: define hidden range(i32 0, 65536) <4 x i32> @test_u32x4_load16x4(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[CONV_I:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[CONV_I]]
//
@@ -119,7 +119,7 @@ v128_t test_u32x4_load16x4(const void *mem) {
// CHECK-LABEL: define hidden <4 x i32> @test_i64x2_load32x2(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[CONV_I:%.*]] = sext <2 x i32> [[TMP0]] to <2 x i64>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[CONV_I]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
@@ -131,7 +131,7 @@ v128_t test_i64x2_load32x2(const void *mem) {
// CHECK-LABEL: define hidden <4 x i32> @test_u64x2_load32x2(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[CONV_I:%.*]] = zext <2 x i32> [[TMP0]] to <2 x i64>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[CONV_I]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
@@ -143,7 +143,7 @@ v128_t test_u64x2_load32x2(const void *mem) {
// CHECK-LABEL: define hidden <4 x i32> @test_v128_load32_zero(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i64 0
// CHECK-NEXT: ret <4 x i32> [[VECINIT4_I]]
//
@@ -154,7 +154,7 @@ v128_t test_v128_load32_zero(const void *mem) {
// CHECK-LABEL: define hidden <4 x i32> @test_v128_load64_zero(
// CHECK-SAME: ptr noundef readonly captures(none) [[MEM:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT2_I]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
@@ -166,7 +166,7 @@ v128_t test_v128_load64_zero(const void *mem) {
// CHECK-LABEL: define hidden <4 x i32> @test_v128_load8_lane(
// CHECK-SAME: ptr noundef readonly captures(none) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VEC]] to <16 x i8>
// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[TMP0]], i64 15
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VECINS_I]] to <4 x i32>
@@ -179,7 +179,7 @@ v128_t test_v128_load8_lane(const uint8_t *ptr, v128_t vec) {
// CHECK-LABEL: define hidden <4 x i32> @test_v128_load16_lane(
// CHECK-SAME: ptr noundef readonly captures(none) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VEC]] to <8 x i16>
// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP0]], i64 7
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[VECINS_I]] to <4 x i32>
@@ -192,7 +192,7 @@ v128_t test_v128_load16_lane(const uint16_t *ptr, v128_t vec) {
// CHECK-LABEL: define hidden <4 x i32> @test_v128_load32_lane(
// CHECK-SAME: ptr noundef readonly captures(none) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <4 x i32> [[VEC]], i32 [[TMP0]], i64 3
// CHECK-NEXT: ret <4 x i32> [[VECINS_I]]
//
@@ -203,7 +203,7 @@ v128_t test_v128_load32_lane(const uint32_t *ptr, v128_t vec) {
// CHECK-LABEL: define hidden <4 x i32> @test_v128_load64_lane(
// CHECK-SAME: ptr noundef readonly captures(none) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VEC]] to <2 x i64>
// CHECK-NEXT: [[VECINS_I:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP0]], i64 1
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[VECINS_I]] to <4 x i32>
@@ -216,7 +216,7 @@ v128_t test_v128_load64_lane(const uint64_t *ptr, v128_t vec) {
// CHECK-LABEL: define hidden void @test_v128_store(
// CHECK-SAME: ptr noundef writeonly captures(none) initializes((0, 16)) [[MEM:%.*]], <4 x i32> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: store <4 x i32> [[A]], ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store <4 x i32> [[A]], ptr [[MEM]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
void test_v128_store(void *mem, v128_t a) {
@@ -228,7 +228,7 @@ void test_v128_store(void *mem, v128_t a) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC]] to <16 x i8>
// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <16 x i8> [[TMP0]], i64 15
-// CHECK-NEXT: store i8 [[VECEXT_I]], ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store i8 [[VECEXT_I]], ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
void test_v128_store8_lane(uint8_t *ptr, v128_t vec) {
@@ -240,7 +240,7 @@ void test_v128_store8_lane(uint8_t *ptr, v128_t vec) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC]] to <8 x i16>
// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i16> [[TMP0]], i64 7
-// CHECK-NEXT: store i16 [[VECEXT_I]], ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store i16 [[VECEXT_I]], ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
void test_v128_store16_lane(uint16_t *ptr, v128_t vec) {
@@ -251,7 +251,7 @@ void test_v128_store16_lane(uint16_t *ptr, v128_t vec) {
// CHECK-SAME: ptr noundef writeonly captures(none) initializes((0, 4)) [[PTR:%.*]], <4 x i32> noundef [[VEC:%.*]]) local_unnamed_addr #[[ATTR1]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[VEC]], i64 3
-// CHECK-NEXT: store i32 [[VECEXT_I]], ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store i32 [[VECEXT_I]], ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
void test_v128_store32_lane(uint32_t *ptr, v128_t vec) {
@@ -263,7 +263,7 @@ void test_v128_store32_lane(uint32_t *ptr, v128_t vec) {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC]] to <2 x i64>
// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1
-// CHECK-NEXT: store i64 [[VECEXT_I]], ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA2]]
+// CHECK-NEXT: store i64 [[VECEXT_I]], ptr [[PTR]], align 1, !tbaa [[CHAR_TBAA6]]
// CHECK-NEXT: ret void
//
void test_v128_store64_lane(uint64_t *ptr, v128_t vec) {
@@ -614,7 +614,7 @@ v128_t test_f64x2_const_splat(void) {
return wasm_f64x2_const_splat(42);
}
-// CHECK-LABEL: define hidden <4 x i32> @test_i8x16_splat(
+// CHECK-LABEL: define hidden noundef <4 x i32> @test_i8x16_splat(
// CHECK-SAME: i8 noundef signext [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
@@ -626,7 +626,7 @@ v128_t test_i8x16_splat(int8_t a) {
return wasm_i8x16_splat(a);
}
-// CHECK-LABEL: define hidden <4 x i32> @test_u8x16_splat(
+// CHECK-LABEL: define hidden noundef <4 x i32> @test_u8x16_splat(
// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
@@ -684,7 +684,7 @@ v128_t test_u8x16_replace_lane(v128_t a, uint8_t b) {
return wasm_u8x16_replace_lane(a, 15, b);
}
-// CHECK-LABEL: define hidden <4 x i32> @test_i16x8_splat(
+// CHECK-LABEL: define hidden noundef <4 x i32> @test_i16x8_splat(
// CHECK-SAME: i16 noundef signext [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
@@ -696,7 +696,7 @@ v128_t test_i16x8_splat(int16_t a) {
return wasm_i16x8_splat(a);
}
-// CHECK-LABEL: define hidden <4 x i32> @test_u16x8_splat(
+// CHECK-LABEL: define hidden noundef <4 x i32> @test_u16x8_splat(
// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
@@ -754,7 +754,7 @@ v128_t test_u16x8_replace_lane(v128_t a, uint16_t b) {
return wasm_u16x8_replace_lane(a, 7, b);
}
-// CHECK-LABEL: define hidden <4 x i32> @test_i32x4_splat(
+// CHECK-LABEL: define hidden noundef <4 x i32> @test_i32x4_splat(
// CHECK-SAME: i32 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
@@ -765,7 +765,7 @@ v128_t test_i32x4_splat(int32_t a) {
return wasm_i32x4_splat(a);
}
-// CHECK-LABEL: define hidden <4 x i32> @test_u32x4_splat(
+// CHECK-LABEL: define hidden noundef <4 x i32> @test_u32x4_splat(
// CHECK-SAME: i32 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
@@ -816,7 +816,7 @@ v128_t test_u32x4_replace_lane(v128_t a, uint32_t b) {
return wasm_u32x4_replace_lane(a, 3, b);
}
-// CHECK-LABEL: define hidden <4 x i32> @test_i64x2_splat(
+// CHECK-LABEL: define hidden noundef <4 x i32> @test_i64x2_splat(
// CHECK-SAME: i64 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
@@ -828,7 +828,7 @@ v128_t test_i64x2_splat(int64_t a) {
return wasm_i64x2_splat(a);
}
-// CHECK-LABEL: define hidden <4 x i32> @test_u64x2_splat(
+// CHECK-LABEL: define hidden noundef <4 x i32> @test_u64x2_splat(
// CHECK-SAME: i64 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
@@ -921,7 +921,7 @@ v128_t test_f32x4_replace_lane(v128_t a, float b) {
return wasm_f32x4_replace_lane(a, 3, b);
}
-// CHECK-LABEL: define hidden <4 x i32> @test_f64x2_splat(
+// CHECK-LABEL: define hidden noundef <4 x i32> @test_f64x2_splat(
// CHECK-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[A]], i64 0
@@ -3468,7 +3468,7 @@ v128_t test_i16x8_q15mulr_sat(v128_t a, v128_t b) {
return wasm_i16x8_q15mulr_sat(a, b);
}
//.
-// CHECK: [[CHAR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
-// CHECK: [[META4]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
+// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[CHAR_TBAA6]] = !{[[META4]], [[META4]], i64 0}
//.
diff --git a/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c b/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c
index 46aba914441bc..6f574acab282e 100644
--- a/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c
+++ b/clang/test/Lexer/minimize_source_to_dependency_directives_utf8bom.c
@@ -1,5 +1,5 @@
// Test UTF8 BOM at start of file
-// RUN: printf '\xef\xbb\xbf' > %t.c
+// RUN: printf '\357\273\277' > %t.c
// RUN: echo '#ifdef TEST\n' >> %t.c
// RUN: echo '#include ' >> %t.c
// RUN: echo '#endif' >> %t.c
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 73d4cb1769ed5..ab4153a64f028 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -35,6 +35,7 @@
// CHECK-NEXT: CFUnknownTransfer (SubjectMatchRule_function)
// CHECK-NEXT: CPUDispatch (SubjectMatchRule_function)
// CHECK-NEXT: CPUSpecific (SubjectMatchRule_function)
+// CHECK-NEXT: CUDAClusterDims (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
// CHECK-NEXT: CUDAConstant (SubjectMatchRule_variable)
// CHECK-NEXT: CUDADevice (SubjectMatchRule_function, SubjectMatchRule_variable)
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
@@ -43,6 +44,7 @@
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
+// CHECK-NEXT: CUDANoCluster (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
// CHECK-NEXT: CXX11NoReturn (SubjectMatchRule_function)
// CHECK-NEXT: CallableWhen (SubjectMatchRule_function_is_member)
diff --git a/clang/test/Misc/target-invalid-cpu-note/x86.c b/clang/test/Misc/target-invalid-cpu-note/x86.c
index f89cdc2aa573f..4a70e9bff3fef 100644
--- a/clang/test/Misc/target-invalid-cpu-note/x86.c
+++ b/clang/test/Misc/target-invalid-cpu-note/x86.c
@@ -63,6 +63,8 @@
// X86-SAME: {{^}}, lunarlake
// X86-SAME: {{^}}, gracemont
// X86-SAME: {{^}}, pantherlake
+// X86-SAME: {{^}}, wildcatlake
+// X86-SAME: {{^}}, novalake
// X86-SAME: {{^}}, sierraforest
// X86-SAME: {{^}}, grandridge
// X86-SAME: {{^}}, graniterapids
@@ -150,6 +152,8 @@
// X86_64-SAME: {{^}}, lunarlake
// X86_64-SAME: {{^}}, gracemont
// X86_64-SAME: {{^}}, pantherlake
+// X86_64-SAME: {{^}}, wildcatlake
+// X86_64-SAME: {{^}}, novalake
// X86_64-SAME: {{^}}, sierraforest
// X86_64-SAME: {{^}}, grandridge
// X86_64-SAME: {{^}}, graniterapids
@@ -246,6 +250,8 @@
// TUNE_X86-SAME: {{^}}, lunarlake
// TUNE_X86-SAME: {{^}}, gracemont
// TUNE_X86-SAME: {{^}}, pantherlake
+// TUNE_X86-SAME: {{^}}, wildcatlake
+// TUNE_X86-SAME: {{^}}, novalake
// TUNE_X86-SAME: {{^}}, sierraforest
// TUNE_X86-SAME: {{^}}, grandridge
// TUNE_X86-SAME: {{^}}, graniterapids
@@ -349,6 +355,8 @@
// TUNE_X86_64-SAME: {{^}}, lunarlake
// TUNE_X86_64-SAME: {{^}}, gracemont
// TUNE_X86_64-SAME: {{^}}, pantherlake
+// TUNE_X86_64-SAME: {{^}}, wildcatlake
+// TUNE_X86_64-SAME: {{^}}, novalake
// TUNE_X86_64-SAME: {{^}}, sierraforest
// TUNE_X86_64-SAME: {{^}}, grandridge
// TUNE_X86_64-SAME: {{^}}, graniterapids
diff --git a/clang/test/Modules/crash-vfs-relative-incdir.m b/clang/test/Modules/crash-vfs-relative-incdir.m
index c0407f776778b..46c3413c7817f 100644
--- a/clang/test/Modules/crash-vfs-relative-incdir.m
+++ b/clang/test/Modules/crash-vfs-relative-incdir.m
@@ -53,4 +53,4 @@
// RUN: cd %t
// RUN: chmod 755 crash-vfs-*.sh
-// RUN: ./crash-vfs-*.sh
+// RUN: bash ./crash-vfs-*.sh
diff --git a/clang/test/Modules/crash-vfs-run-reproducer.m b/clang/test/Modules/crash-vfs-run-reproducer.m
index fd861fed5a603..fa06fd988c98f 100644
--- a/clang/test/Modules/crash-vfs-run-reproducer.m
+++ b/clang/test/Modules/crash-vfs-run-reproducer.m
@@ -53,4 +53,4 @@
// RUN: cd %t
// RUN: chmod 755 crash-vfs-*.sh
-// RUN: ./crash-vfs-*.sh
+// RUN: bash ./crash-vfs-*.sh
diff --git a/clang/test/OpenMP/bug54082.c b/clang/test/OpenMP/bug54082.c
index ef3e7153545bf..1a98e122b84ca 100644
--- a/clang/test/OpenMP/bug54082.c
+++ b/clang/test/OpenMP/bug54082.c
@@ -72,7 +72,7 @@ void foo() {
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(16) [[X_TRAITS]], ptr noundef nonnull align 16 dereferenceable(16) @__const.foo.x_traits, i64 16, i1 false)
// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[X_ALLOC]]) #[[ATTR5]]
// CHECK-NEXT: [[CALL:%.*]] = call i64 @omp_init_allocator(i64 noundef 0, i32 noundef 1, ptr noundef nonnull [[X_TRAITS]]) #[[ATTR5]]
-// CHECK-NEXT: store i64 [[CALL]], ptr [[X_ALLOC]], align 8, !tbaa [[LONG_TBAA3:![0-9]+]]
+// CHECK-NEXT: store i64 [[CALL]], ptr [[X_ALLOC]], align 8, !tbaa [[LONG_TBAA7:![0-9]+]]
// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr nonnull @[[GLOB2:[0-9]+]], i32 1, ptr nonnull @foo.omp_outlined, ptr nonnull [[X_ALLOC]])
// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[X_ALLOC]]) #[[ATTR5]]
// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[X_TRAITS]]) #[[ATTR5]]
@@ -87,23 +87,23 @@ void foo() {
// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[DOTOMP_LB]]) #[[ATTR5]]
-// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA7:![0-9]+]]
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA3:![0-9]+]]
// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[DOTOMP_UB]]) #[[ATTR5]]
-// CHECK-NEXT: store i32 1023, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA7]]
+// CHECK-NEXT: store i32 1023, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA3]]
// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[DOTOMP_STRIDE]]) #[[ATTR5]]
-// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA7]]
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA3]]
// CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr nonnull [[DOTOMP_IS_LAST]]) #[[ATTR5]]
-// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA7]]
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[INT_TBAA7]]
-// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[X_ALLOC]], align 8, !tbaa [[LONG_TBAA3]]
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA3]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[INT_TBAA3]]
+// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[X_ALLOC]], align 8, !tbaa [[LONG_TBAA7]]
// CHECK-NEXT: [[CONV:%.*]] = inttoptr i64 [[TMP1]] to ptr
// CHECK-NEXT: [[DOTX__VOID_ADDR:%.*]] = tail call ptr @__kmpc_alloc(i32 [[TMP0]], i64 8, ptr [[CONV]])
// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr nonnull @[[GLOB1:[0-9]+]], i32 [[TMP0]], i32 34, ptr nonnull [[DOTOMP_IS_LAST]], ptr nonnull [[DOTOMP_LB]], ptr nonnull [[DOTOMP_UB]], ptr nonnull [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA7]]
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA3]]
// CHECK-NEXT: [[COND:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP2]], i32 1023)
-// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA7]]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA3]]
// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr nonnull @[[GLOB1]], i32 [[TMP0]])
-// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[X_ALLOC]], align 8, !tbaa [[LONG_TBAA3]]
+// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[X_ALLOC]], align 8, !tbaa [[LONG_TBAA7]]
// CHECK-NEXT: [[CONV5:%.*]] = inttoptr i64 [[TMP3]] to ptr
// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], ptr [[DOTX__VOID_ADDR]], ptr [[CONV5]])
// CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr nonnull [[DOTOMP_IS_LAST]]) #[[ATTR5]]
@@ -113,10 +113,10 @@ void foo() {
// CHECK-NEXT: ret void
//
//.
-// CHECK: [[LONG_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
-// CHECK: [[META4]] = !{!"long", [[META5:![0-9]+]], i64 0}
+// CHECK: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+// CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0}
// CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
// CHECK: [[META6]] = !{!"Simple C/C++ TBAA"}
-// CHECK: [[INT_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
-// CHECK: [[META8]] = !{!"int", [[META5]], i64 0}
+// CHECK: [[LONG_TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
+// CHECK: [[META8]] = !{!"long", [[META5]], i64 0}
//.
diff --git a/clang/test/OpenMP/bug57757.cpp b/clang/test/OpenMP/bug57757.cpp
index 7f253aad081b0..8cda35e70f553 100644
--- a/clang/test/OpenMP/bug57757.cpp
+++ b/clang/test/OpenMP/bug57757.cpp
@@ -20,9 +20,9 @@ void foo() {
// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1:[0-9]+]])
// CHECK-NEXT: [[TMP1:%.*]] = tail call ptr @__kmpc_omp_task_alloc(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 0, i64 56, i64 1, ptr nonnull @.omp_task_entry.)
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-// CHECK-NEXT: store ptr @_Z3barif, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA3:![0-9]+]]
+// CHECK-NEXT: store ptr @_Z3barif, ptr [[TMP2]], align 8, !tbaa [[ANYPTR_TBAA7:![0-9]+]]
// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-// CHECK-NEXT: store i32 0, ptr [[TMP3]], align 8, !tbaa [[INT_TBAA12:![0-9]+]]
+// CHECK-NEXT: store i32 0, ptr [[TMP3]], align 8, !tbaa [[INT_TBAA13:![0-9]+]]
// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]])
// CHECK-NEXT: ret void
//
@@ -31,47 +31,47 @@ void foo() {
// CHECK-SAME: i32 noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-// CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]])
-// CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META16:![0-9]+]])
-// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[INT_TBAA18:![0-9]+]], !alias.scope [[META13]], !noalias [[META16]]
+// CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META14:![0-9]+]])
+// CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META17:![0-9]+]])
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[INT_TBAA3:![0-9]+]], !alias.scope [[META14]], !noalias [[META17]]
// CHECK-NEXT: switch i32 [[TMP3]], [[DOTOMP_OUTLINED__EXIT:label %.*]] [
// CHECK-NEXT: i32 0, [[DOTUNTIED_JMP__I:label %.*]]
// CHECK-NEXT: i32 1, [[DOTUNTIED_NEXT__I:label %.*]]
// CHECK-NEXT: ]
// CHECK: [[_UNTIED_JMP__I:.*:]]
-// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4, !tbaa [[INT_TBAA18]], !alias.scope [[META13]], !noalias [[META16]]
+// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4, !tbaa [[INT_TBAA3]], !alias.scope [[META14]], !noalias [[META17]]
// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr nonnull [[TMP1]]), !noalias [[META19:![0-9]+]]
// CHECK-NEXT: br [[DOTOMP_OUTLINED__EXIT]]
// CHECK: [[_UNTIED_NEXT__I:.*:]]
// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 48
-// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[ANYPTR_TBAA20:![0-9]+]], !alias.scope [[META16]], !noalias [[META13]]
-// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 8, !tbaa [[INT_TBAA18]], !alias.scope [[META16]], !noalias [[META13]]
-// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[FLOAT_TBAA21:![0-9]+]], !alias.scope [[META16]], !noalias [[META13]]
+// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[ANYPTR_TBAA20:![0-9]+]], !alias.scope [[META17]], !noalias [[META14]]
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 8, !tbaa [[INT_TBAA3]], !alias.scope [[META17]], !noalias [[META14]]
+// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[FLOAT_TBAA21:![0-9]+]], !alias.scope [[META17]], !noalias [[META14]]
// CHECK-NEXT: tail call void [[TMP8]](i32 noundef [[TMP9]], float noundef [[TMP10]]) #[[ATTR2:[0-9]+]], !noalias [[META19]]
// CHECK-NEXT: br [[DOTOMP_OUTLINED__EXIT]]
// CHECK: [[_OMP_OUTLINED__EXIT:.*:]]
// CHECK-NEXT: ret i32 0
//
//.
-// CHECK: [[ANYPTR_TBAA3]] = !{[[META4:![0-9]+]], [[META6:![0-9]+]], i64 40}
-// CHECK: [[META4]] = !{!"_ZTS24kmp_task_t_with_privates", [[META5:![0-9]+]], i64 0, [[META10:![0-9]+]], i64 40}
-// CHECK: [[META5]] = !{!"_ZTS10kmp_task_t", [[META6]], i64 0, [[META6]], i64 8, [[META9:![0-9]+]], i64 16, [[META7:![0-9]+]], i64 24, [[META7]], i64 32}
-// CHECK: [[META6]] = !{!"any pointer", [[META7]], i64 0}
-// CHECK: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0}
-// CHECK: [[META8]] = !{!"Simple C++ TBAA"}
-// CHECK: [[META9]] = !{!"int", [[META7]], i64 0}
-// CHECK: [[META10]] = !{!"_ZTS15.kmp_privates.t", [[META6]], i64 0, [[META9]], i64 8, [[META11:![0-9]+]], i64 12}
-// CHECK: [[META11]] = !{!"float", [[META7]], i64 0}
-// CHECK: [[INT_TBAA12]] = !{[[META4]], [[META9]], i64 16}
-// CHECK: [[META13]] = !{[[META14:![0-9]+]]}
-// CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]], !".omp_outlined.: %.part_id."}
-// CHECK: [[META15]] = distinct !{[[META15]], !".omp_outlined."}
-// CHECK: [[META16]] = !{[[META17:![0-9]+]]}
-// CHECK: [[META17]] = distinct !{[[META17]], [[META15]], !".omp_outlined.: %.privates."}
-// CHECK: [[INT_TBAA18]] = !{[[META9]], [[META9]], i64 0}
-// CHECK: [[META19]] = !{[[META14]], [[META17]]}
-// CHECK: [[ANYPTR_TBAA20]] = !{[[META6]], [[META6]], i64 0}
-// CHECK: [[FLOAT_TBAA21]] = !{[[META11]], [[META11]], i64 0}
+// CHECK: [[INT_TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+// CHECK: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0}
+// CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
+// CHECK: [[META6]] = !{!"Simple C++ TBAA"}
+// CHECK: [[ANYPTR_TBAA7]] = !{[[META8:![0-9]+]], [[META10:![0-9]+]], i64 40}
+// CHECK: [[META8]] = !{!"_ZTS24kmp_task_t_with_privates", [[META9:![0-9]+]], i64 0, [[META11:![0-9]+]], i64 40}
+// CHECK: [[META9]] = !{!"_ZTS10kmp_task_t", [[META10]], i64 0, [[META10]], i64 8, [[META4]], i64 16, [[META5]], i64 24, [[META5]], i64 32}
+// CHECK: [[META10]] = !{!"any pointer", [[META5]], i64 0}
+// CHECK: [[META11]] = !{!"_ZTS15.kmp_privates.t", [[META10]], i64 0, [[META4]], i64 8, [[META12:![0-9]+]], i64 12}
+// CHECK: [[META12]] = !{!"float", [[META5]], i64 0}
+// CHECK: [[INT_TBAA13]] = !{[[META8]], [[META4]], i64 16}
+// CHECK: [[META14]] = !{[[META15:![0-9]+]]}
+// CHECK: [[META15]] = distinct !{[[META15]], [[META16:![0-9]+]], !".omp_outlined.: %.part_id."}
+// CHECK: [[META16]] = distinct !{[[META16]], !".omp_outlined."}
+// CHECK: [[META17]] = !{[[META18:![0-9]+]]}
+// CHECK: [[META18]] = distinct !{[[META18]], [[META16]], !".omp_outlined.: %.privates."}
+// CHECK: [[META19]] = !{[[META15]], [[META18]]}
+// CHECK: [[ANYPTR_TBAA20]] = !{[[META10]], [[META10]], i64 0}
+// CHECK: [[FLOAT_TBAA21]] = !{[[META12]], [[META12]], i64 0}
//.
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
index 46c87eb31969d..4b99d7766c873 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
@@ -36,14 +36,14 @@ void test() {
// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[ANYPTR_TBAA6:![0-9]+]]
+// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[ANYPTR_TBAA10:![0-9]+]]
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_kernel_environment, ptr [[DYN_PTR]])
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[WORKER_EXIT:.*]]
// CHECK1: [[USER_CODE_ENTRY]]:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA10:![0-9]+]]
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA6:![0-9]+]]
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4:[0-9]+]]
// CHECK1-NEXT: call void @__kmpc_target_deinit()
// CHECK1-NEXT: ret void
@@ -73,43 +73,43 @@ void test() {
// CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 8)
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IV]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[IB]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
// CHECK1: [[COND_TRUE]]:
// CHECK1-NEXT: br label %[[COND_END:.*]]
// CHECK1: [[COND_FALSE]]:
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: br label %[[COND_END]]
// CHECK1: [[COND_END]]:
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ]
-// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND:.*]]
// CHECK1: [[OMP_INNER_FOR_COND]]:
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
// CHECK1-NEXT: br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_COND_CLEANUP:.*]]
// CHECK1: [[OMP_INNER_FOR_COND_CLEANUP]]:
// CHECK1-NEXT: br label %[[OMP_INNER_FOR_END:.*]]
// CHECK1: [[OMP_INNER_FOR_BODY]]:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP]]) #[[ATTR4]]
// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[FLOAT_TBAA14:![0-9]+]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP2]]) #[[ATTR4]]
@@ -117,13 +117,13 @@ void test() {
// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR11:[0-9]+]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP2]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP8]], 4
-// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4
-// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[ANYPTR_TBAA16:![0-9]+]]
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
@@ -135,9 +135,9 @@ void test() {
// CHECK1: [[OMP_BODY_CONTINUE]]:
// CHECK1-NEXT: br label %[[OMP_INNER_FOR_INC:.*]]
// CHECK1: [[OMP_INNER_FOR_INC]]:
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND]]
// CHECK1: [[OMP_INNER_FOR_END]]:
// CHECK1-NEXT: br label %[[OMP_LOOP_EXIT:.*]]
@@ -207,38 +207,38 @@ void test() {
// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIFEPTR_TBAA18]], !nonnull [[META22]], !align [[META23]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IV]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP5]], [[TMP6]]
// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], 1
// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], 1
// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
-// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
// CHECK1-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]]
// CHECK1: [[OMP_PRECOND_THEN]]:
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[PARTIAL_SUM5]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP]]) #[[ATTR4]]
// CHECK1-NEXT: store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[FLOAT_TBAA14]]
@@ -249,27 +249,27 @@ void test() {
// CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[I7]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB3:[0-9]+]], i32 [[TMP12]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
// CHECK1-NEXT: br label %[[OMP_DISPATCH_COND:.*]]
// CHECK1: [[OMP_DISPATCH_COND]]:
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i32 [[TMP13]], [[TMP14]]
// CHECK1-NEXT: br i1 [[CMP8]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
// CHECK1: [[COND_TRUE]]:
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: br label %[[COND_END:.*]]
// CHECK1: [[COND_FALSE]]:
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: br label %[[COND_END]]
// CHECK1: [[COND_END]]:
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP15]], %[[COND_TRUE]] ], [ [[TMP16]], %[[COND_FALSE]] ]
-// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[ADD9:%.*]] = add i32 [[TMP19]], 1
// CHECK1-NEXT: [[CMP10:%.*]] = icmp ult i32 [[TMP18]], [[ADD9]]
// CHECK1-NEXT: br i1 [[CMP10]], label %[[OMP_DISPATCH_BODY:.*]], label %[[OMP_DISPATCH_CLEANUP:.*]]
@@ -278,26 +278,26 @@ void test() {
// CHECK1: [[OMP_DISPATCH_BODY]]:
// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND:.*]]
// CHECK1: [[OMP_INNER_FOR_COND]]:
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[ADD11:%.*]] = add i32 [[TMP21]], 1
// CHECK1-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP20]], [[ADD11]]
// CHECK1-NEXT: br i1 [[CMP12]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_COND_CLEANUP:.*]]
// CHECK1: [[OMP_INNER_FOR_COND_CLEANUP]]:
// CHECK1-NEXT: br label %[[OMP_INNER_FOR_END:.*]]
// CHECK1: [[OMP_INNER_FOR_BODY]]:
-// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP23]], 1
// CHECK1-NEXT: [[ADD13:%.*]] = add i32 [[TMP22]], [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP14]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP15]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP24]] to float
// CHECK1-NEXT: store float [[CONV]], ptr [[REF_TMP15]], align 4, !tbaa [[FLOAT_TBAA14]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP16]]) #[[ATTR4]]
-// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP25]] to float
// CHECK1-NEXT: store float [[CONV17]], ptr [[REF_TMP16]], align 4, !tbaa [[FLOAT_TBAA14]]
// CHECK1-NEXT: call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP15]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR11]]
@@ -309,25 +309,25 @@ void test() {
// CHECK1: [[OMP_BODY_CONTINUE]]:
// CHECK1-NEXT: br label %[[OMP_INNER_FOR_INC:.*]]
// CHECK1: [[OMP_INNER_FOR_INC]]:
-// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP26]], 1
-// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND]]
// CHECK1: [[OMP_INNER_FOR_END]]:
// CHECK1-NEXT: br label %[[OMP_DISPATCH_INC:.*]]
// CHECK1: [[OMP_DISPATCH_INC]]:
-// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[ADD19:%.*]] = add i32 [[TMP27]], [[TMP28]]
-// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP29]], [[TMP30]]
-// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: br label %[[OMP_DISPATCH_COND]]
// CHECK1: [[OMP_DISPATCH_END]]:
// CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]])
// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8
@@ -508,16 +508,16 @@ void test() {
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[SHORT_TBAA27:![0-9]+]]
-// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0
-// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[ANYPTR_TBAA6]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[ANYPTR_TBAA10]]
// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1
-// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[ANYPTR_TBAA6]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[ANYPTR_TBAA10]]
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 2
-// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[ANYPTR_TBAA6]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[ANYPTR_TBAA10]]
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP4]], ptr [[TMP6]], ptr [[TMP8]]) #[[ATTR4]]
// CHECK1-NEXT: ret void
//
@@ -528,14 +528,14 @@ void test() {
// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[ANYPTR_TBAA6]]
+// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[ANYPTR_TBAA10]]
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_kernel_environment, ptr [[DYN_PTR]])
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label %[[USER_CODE_ENTRY:.*]], label %[[WORKER_EXIT:.*]]
// CHECK1: [[USER_CODE_ENTRY]]:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
// CHECK1-NEXT: call void @__kmpc_target_deinit()
// CHECK1-NEXT: ret void
@@ -565,43 +565,43 @@ void test() {
// CHECK1-NEXT: [[PARTIAL_SUM:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 16)
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IV]]) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[IB]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
// CHECK1: [[COND_TRUE]]:
// CHECK1-NEXT: br label %[[COND_END:.*]]
// CHECK1: [[COND_FALSE]]:
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: br label %[[COND_END]]
// CHECK1: [[COND_END]]:
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, %[[COND_TRUE]] ], [ [[TMP3]], %[[COND_FALSE]] ]
-// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND:.*]]
// CHECK1: [[OMP_INNER_FOR_COND]]:
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
-// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
// CHECK1-NEXT: br i1 [[CMP1]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_COND_CLEANUP:.*]]
// CHECK1: [[OMP_INNER_FOR_COND_CLEANUP]]:
// CHECK1-NEXT: br label %[[OMP_INNER_FOR_END:.*]]
// CHECK1: [[OMP_INNER_FOR_BODY]]:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[INT_TBAA10]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[INT_TBAA6]]
// CHECK1-NEXT: call void
@llvm.lifetime.start.p0(ptr [[REF_TMP]]) #[[ATTR4]] // CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[DOUBLE_TBAA29:![0-9]+]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP2]]) #[[ATTR4]] @@ -609,13 +609,13 @@ void test() { // CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR11]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP2]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP8]], 4 -// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1 // CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4 -// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 // CHECK1-NEXT: store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[ANYPTR_TBAA16]] // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 @@ -627,9 +627,9 @@ void test() { // CHECK1: [[OMP_BODY_CONTINUE]]: // CHECK1-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] // CHECK1: [[OMP_INNER_FOR_INC]]: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1 -// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND]] // CHECK1: [[OMP_INNER_FOR_END]]: // CHECK1-NEXT: br label %[[OMP_LOOP_EXIT:.*]] @@ -699,38 +699,38 @@ void test() { // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[_ZTSST7COMPLEXIDEPTR_TBAA31]], !nonnull [[META22]], !align [[META35:![0-9]+]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IV]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[INT_TBAA6]] +// 
CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP5]], [[TMP6]] // CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 // CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], 1 // CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], 1 // CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 -// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[I]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[I]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]] // CHECK1-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]] // CHECK1: [[OMP_PRECOND_THEN]]: // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_LB]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_UB]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_STRIDE]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]] -// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[PARTIAL_SUM5]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP]]) #[[ATTR4]] // CHECK1-NEXT: store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[DOUBLE_TBAA29]] @@ -741,27 +741,27 @@ 
void test() { // CHECK1-NEXT: call void @llvm.lifetime.end.p0(ptr [[REF_TMP]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[I7]]) #[[ATTR4]] // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB3]], i32 [[TMP12]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) // CHECK1-NEXT: br label %[[OMP_DISPATCH_COND:.*]] // CHECK1: [[OMP_DISPATCH_COND]]: -// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[CMP8:%.*]] = icmp ugt i32 [[TMP13]], [[TMP14]] // CHECK1-NEXT: br i1 [[CMP8]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] // CHECK1: [[COND_TRUE]]: -// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: br label %[[COND_END:.*]] // CHECK1: [[COND_FALSE]]: -// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: br label %[[COND_END]] // CHECK1: [[COND_END]]: // CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP15]], %[[COND_TRUE]] ], [ [[TMP16]], %[[COND_FALSE]] ] -// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[ADD9:%.*]] = add i32 [[TMP19]], 1 // CHECK1-NEXT: [[CMP10:%.*]] = icmp ult i32 [[TMP18]], [[ADD9]] // CHECK1-NEXT: br i1 [[CMP10]], label %[[OMP_DISPATCH_BODY:.*]], label %[[OMP_DISPATCH_CLEANUP:.*]] @@ -770,26 +770,26 @@ void test() { // CHECK1: [[OMP_DISPATCH_BODY]]: // CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] // CHECK1: [[OMP_INNER_FOR_COND]]: -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[ADD11:%.*]] = add i32 [[TMP21]], 1 // 
CHECK1-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP20]], [[ADD11]] // CHECK1-NEXT: br i1 [[CMP12]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_COND_CLEANUP:.*]] // CHECK1: [[OMP_INNER_FOR_COND_CLEANUP]]: // CHECK1-NEXT: br label %[[OMP_INNER_FOR_END:.*]] // CHECK1: [[OMP_INNER_FOR_BODY]]: -// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP23]], 1 // CHECK1-NEXT: [[ADD13:%.*]] = add i32 [[TMP22]], [[MUL]] -// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP14]]) #[[ATTR4]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP15]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP24]] to double // CHECK1-NEXT: store double [[CONV]], ptr [[REF_TMP15]], align 8, !tbaa [[DOUBLE_TBAA29]] // CHECK1-NEXT: call void @llvm.lifetime.start.p0(ptr [[REF_TMP16]]) #[[ATTR4]] -// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[CONV17:%.*]] = sitofp i32 [[TMP25]] to double // CHECK1-NEXT: store double [[CONV17]], ptr [[REF_TMP16]], align 8, !tbaa [[DOUBLE_TBAA29]] // CHECK1-NEXT: call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP15]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR11]] @@ -801,25 +801,25 @@ void test() { // CHECK1: [[OMP_BODY_CONTINUE]]: // CHECK1-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] // CHECK1: [[OMP_INNER_FOR_INC]]: -// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP26]], 1 -// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: br label %[[OMP_INNER_FOR_COND]] // CHECK1: [[OMP_INNER_FOR_END]]: // CHECK1-NEXT: br label %[[OMP_DISPATCH_INC:.*]] // CHECK1: [[OMP_DISPATCH_INC]]: -// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[ADD19:%.*]] = add i32 [[TMP27]], [[TMP28]] -// CHECK1-NEXT: store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] -// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[ADD19]], ptr 
[[DOTOMP_LB]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]] +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP29]], [[TMP30]] -// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: br label %[[OMP_DISPATCH_COND]] // CHECK1: [[OMP_DISPATCH_END]]: // CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]]) // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 // CHECK1-NEXT: store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8 @@ -1013,16 +1013,16 @@ void test() { // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[SHORT_TBAA27]] -// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[INT_TBAA10]] +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[INT_TBAA6]] // CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0 -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[ANYPTR_TBAA6]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[ANYPTR_TBAA10]] // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1 -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[ANYPTR_TBAA6]] +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[ANYPTR_TBAA10]] // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 2 -// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[ANYPTR_TBAA6]] +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[ANYPTR_TBAA10]] // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP4]], ptr [[TMP6]], ptr [[TMP8]]) #[[ATTR4]] // CHECK1-NEXT: ret void // @@ -1113,22 +1113,22 @@ void test() { // CHECK1-NEXT: ret double [[TMP0]] // //. 
-// CHECK1: [[ANYPTR_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} -// CHECK1: [[META7]] = !{!"any pointer", [[META8:![0-9]+]], i64 0} +// CHECK1: [[INT_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK1: [[META7]] = !{!"int", [[META8:![0-9]+]], i64 0} // CHECK1: [[META8]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} // CHECK1: [[META9]] = !{!"Simple C++ TBAA"} -// CHECK1: [[INT_TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0} -// CHECK1: [[META11]] = !{!"int", [[META8]], i64 0} +// CHECK1: [[ANYPTR_TBAA10]] = !{[[META11:![0-9]+]], [[META11]], i64 0} +// CHECK1: [[META11]] = !{!"any pointer", [[META8]], i64 0} // CHECK1: [[INTPTR_TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} -// CHECK1: [[META13]] = !{!"p1 int", [[META7]], i64 0} +// CHECK1: [[META13]] = !{!"p1 int", [[META11]], i64 0} // CHECK1: [[FLOAT_TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0} // CHECK1: [[META15]] = !{!"float", [[META8]], i64 0} // CHECK1: [[ANYPTR_TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0} -// CHECK1: [[META17]] = !{!"any p2 pointer", [[META7]], i64 0} +// CHECK1: [[META17]] = !{!"any p2 pointer", [[META11]], i64 0} // CHECK1: [[_ZTSST7COMPLEXIFEPTR_TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -// CHECK1: [[META19]] = !{!"p1 _ZTSSt7complexIfE", [[META7]], i64 0} +// CHECK1: [[META19]] = !{!"p1 _ZTSSt7complexIfE", [[META11]], i64 0} // CHECK1: [[FLOATPTR_TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} -// CHECK1: [[META21]] = !{!"p1 float", [[META7]], i64 0} +// CHECK1: [[META21]] = !{!"p1 float", [[META11]], i64 0} // CHECK1: [[META22]] = !{} // CHECK1: [[META23]] = !{i64 4} // CHECK1: [[FLOAT_TBAA24]] = !{[[META25:![0-9]+]], [[META15]], i64 0} @@ -1139,9 +1139,9 @@ void test() { // CHECK1: [[DOUBLE_TBAA29]] = !{[[META30:![0-9]+]], [[META30]], i64 0} // CHECK1: [[META30]] = !{!"double", [[META8]], i64 0} // CHECK1: [[_ZTSST7COMPLEXIDEPTR_TBAA31]] = !{[[META32:![0-9]+]], [[META32]], i64 0} -// CHECK1: [[META32]] = !{!"p1 _ZTSSt7complexIdE", [[META7]], i64 0} +// CHECK1: [[META32]] = !{!"p1 _ZTSSt7complexIdE", [[META11]], i64 0} // CHECK1: [[DOUBLEPTR_TBAA33]] = !{[[META34:![0-9]+]], [[META34]], i64 0} -// CHECK1: [[META34]] = !{!"p1 double", [[META7]], i64 0} +// CHECK1: [[META34]] = !{!"p1 double", [[META11]], i64 0} // CHECK1: [[META35]] = !{i64 8} // CHECK1: [[DOUBLE_TBAA36]] = !{[[META37:![0-9]+]], [[META30]], i64 0} // CHECK1: [[META37]] = !{!"_ZTSSt7complexIdE", [[META30]], i64 0, [[META30]], i64 8} diff --git a/clang/test/PCH/cxx1z-decomposition.cpp b/clang/test/PCH/cxx1z-decomposition.cpp index 914ce80c550d1..340d5eae0b8c3 100644 --- a/clang/test/PCH/cxx1z-decomposition.cpp +++ b/clang/test/PCH/cxx1z-decomposition.cpp @@ -22,7 +22,7 @@ constexpr int foo(Q &&q) { return a * 10 + b; } -auto [noinit]; // expected-error{{decomposition declaration '[noinit]' requires an initializer}} +auto [noinit]; // expected-error{{structured binding declaration '[noinit]' requires an initializer}} #else @@ -31,7 +31,7 @@ int k = decomp(arr); static_assert(foo({1, 2}) == 12); -// expected-error@15 {{cannot decompose non-class, non-array type 'const int'}} +// expected-error@15 {{cannot bind non-class, non-array type 'const int'}} int z = decomp(10); // expected-note {{instantiation of}} #endif diff --git a/clang/test/Parser/DelayedTemplateParsing.cpp b/clang/test/Parser/DelayedTemplateParsing.cpp index bcd286ae04492..072c7ce5162ee 100644 --- a/clang/test/Parser/DelayedTemplateParsing.cpp +++ b/clang/test/Parser/DelayedTemplateParsing.cpp @@ -43,10 +43,10 @@ void undeclared() 
} -template void foo5() {} //expected-note {{previous definition is here}} +template void foo5() {} //expected-note {{previous definition is here}} template void foo5() {} // expected-error {{redefinition of 'foo5'}} - + namespace PR11931 { @@ -195,3 +195,12 @@ template struct PR38460_2 { } }; template struct PR38460_2; + +namespace LateParsedAttrs { + template + void f(int a) __attribute__((__diagnose_if__(a > 0, "foo", "error"))) {} + // expected-note@-1 {{from 'diagnose_if' attribute on 'f'}} + void g() { + f(1); // expected-error {{foo}} + } +} // namespace LateParsedAttrs diff --git a/clang/test/Parser/c2x-auto.c b/clang/test/Parser/c2x-auto.c index b878a5b7c42d4..7f80b0717ab25 100644 --- a/clang/test/Parser/c2x-auto.c +++ b/clang/test/Parser/c2x-auto.c @@ -130,3 +130,30 @@ void atomic(void) { void attributes(void) { auto ident [[clang::annotate("this works")]] = 12; // c17-error {{type specifier missing, defaults to 'int'; ISO C99 and later do not support implicit int}} } + +/** GH163090 */ +constexpr auto int a1 = 0; // c23-error {{illegal storage class on file-scoped variable}} \ + c23-error {{cannot combine with previous 'auto' declaration specifier}} \ + c17-error {{illegal storage class on file-scoped variable}} \ + c17-error {{unknown type name 'constexpr'}} + +constexpr int auto a2 = 0; // c23-error {{cannot combine with previous 'int' declaration specifier}} \ + c17-error {{illegal storage class on file-scoped variable}} \ + c17-error {{unknown type name 'constexpr'}} + +auto int b1 = 0; // c23-error {{illegal storage class on file-scoped variable}} \ + c17-error {{illegal storage class on file-scoped variable}} + +int auto b2 = 0; // c23-error {{cannot combine with previous 'int' declaration specifier}} \ + c17-error {{illegal storage class on file-scoped variable}} + +void f() { + constexpr auto int c1 = 0; // c23-error {{cannot combine with previous 'auto' declaration specifier}} \ + c17-error {{use of undeclared identifier 'constexpr'}} + + constexpr int auto c2 = 0; // c23-error {{cannot combine with previous 'int' declaration specifier}} \ + c17-error {{use of undeclared identifier 'constexpr'}} + + auto int d1 = 0; + int auto d2 = 0; // c23-error {{cannot combine with previous 'int' declaration specifier}} +} diff --git a/clang/test/Parser/cxx1z-class-template-argument-deduction.cpp b/clang/test/Parser/cxx1z-class-template-argument-deduction.cpp index 9d27f83698e00..ece00a08954b9 100644 --- a/clang/test/Parser/cxx1z-class-template-argument-deduction.cpp +++ b/clang/test/Parser/cxx1z-class-template-argument-deduction.cpp @@ -158,7 +158,7 @@ namespace decl { A arr[3] = 0; // expected-error {{cannot form array of deduced class template specialization type}} A F::*pm = 0; // expected-error {{cannot form pointer to deduced class template specialization type}} A (*fp)() = 0; // expected-error {{cannot form function returning deduced class template specialization type}} - A [x, y] = 0; // expected-error {{cannot be declared with type 'A'}} expected-error {{type 'A' decomposes into 0 elements, but 2 names were provided}} + A [x, y] = 0; // expected-error {{cannot be declared with type 'A'}} expected-error {{type 'A' binds to 0 elements, but 2 names were provided}} } namespace typename_specifier { @@ -185,7 +185,7 @@ namespace typename_specifier { typename ::A arr[3] = 0; // expected-error {{cannot form array of deduced class template specialization type}} typename ::A F::*pm = 0; // expected-error {{cannot form pointer to deduced class template specialization type}} typename 
::A (*fp)() = 0; // expected-error {{cannot form function returning deduced class template specialization type}} - typename ::A [x, y] = 0; // expected-error {{cannot be declared with type 'typename ::A'}} expected-error {{type 'typename ::A' (aka 'A') decomposes into 0}} + typename ::A [x, y] = 0; // expected-error {{cannot be declared with type 'typename ::A'}} expected-error {{type 'typename ::A' (aka 'A') binds to 0}} struct X { template struct A { A(T); }; }; // expected-note 8{{declared here}} @@ -208,7 +208,7 @@ namespace typename_specifier { {typename T::A arr[3] = 0;} // expected-error {{refers to class template member}} {typename T::A F::*pm = 0;} // expected-error {{refers to class template member}} {typename T::A (*fp)() = 0;} // expected-error {{refers to class template member}} - {typename T::A [x, y] = 0;} // expected-error {{cannot be declared with type 'typename T::A'}} expected-error {{type 'typename typename_specifier::X::A' (aka 'typename_specifier::X::A') decomposes into 0}} + {typename T::A [x, y] = 0;} // expected-error {{cannot be declared with type 'typename T::A'}} expected-error {{type 'typename typename_specifier::X::A' (aka 'typename_specifier::X::A') binds to 0}} } template void f(); // expected-note {{instantiation of}} diff --git a/clang/test/Parser/cxx1z-decomposition.cpp b/clang/test/Parser/cxx1z-decomposition.cpp index 274e24ea55522..21c9419e8f413 100644 --- a/clang/test/Parser/cxx1z-decomposition.cpp +++ b/clang/test/Parser/cxx1z-decomposition.cpp @@ -5,7 +5,7 @@ struct S { int a, b, c; }; // expected-note 2 {{'S::a' declared here}} -// A simple-declaration can be a decompsition declaration. +// A simple-declaration can be a structured binding declaration. namespace SimpleDecl { auto [a_x, b_x, c_x] = S(); @@ -19,7 +19,7 @@ namespace SimpleDecl { } } -// A for-range-declaration can be a decomposition declaration. +// A for-range-declaration can be a structured binding declaration. namespace ForRangeDecl { extern S arr[10]; void h() { @@ -100,12 +100,12 @@ namespace BadSpecifiers { inline auto &[k] = n; // expected-error {{cannot be declared 'inline'}} const int K = 5; - auto ([c]) = s; // expected-error {{decomposition declaration cannot be declared with parentheses}} + auto ([c]) = s; // expected-error {{structured binding declaration cannot be declared with parentheses}} void g() { // defining-type-specifiers other than cv-qualifiers and 'auto' S [a] = s; // expected-error {{cannot be declared with type 'S'}} decltype(auto) [b] = s; // expected-error {{cannot be declared with type 'decltype(auto)'}} - auto ([c2]) = s; // cxx17-error {{decomposition declaration cannot be declared with parenthese}} \ + auto ([c2]) = s; // cxx17-error {{structured binding declaration cannot be declared with parenthese}} \ // post2b-error {{use of undeclared identifier 'c2'}} \ // post2b-error {{expected body of lambda expression}} \ @@ -114,7 +114,7 @@ namespace BadSpecifiers { auto [e][1] = s; // expected-error {{expected ';'}} expected-error {{requires an initializer}} // FIXME: This should fire the 'misplaced array declarator' diagnostic. 
- int [K] arr = {0}; // expected-error {{expected ';'}} expected-error {{cannot be declared with type 'int'}} expected-error {{decomposition declaration '[K]' requires an initializer}} + int [K] arr = {0}; // expected-error {{expected ';'}} expected-error {{cannot be declared with type 'int'}} expected-error {{structured binding declaration '[K]' requires an initializer}} int [5] arr = {0}; // expected-error {{place the brackets after the name}} auto *[f] = s; // expected-error {{cannot be declared with type 'auto *'}} expected-error {{incompatible initializer}} @@ -145,14 +145,14 @@ namespace MultiDeclarator { namespace Template { int n[3]; // Structured binding template is not allowed. - template auto [a, b, c] = n; // expected-error {{decomposition declaration cannot be a template}} + template auto [a, b, c] = n; // expected-error {{structured binding declaration cannot be a template}} } namespace Init { void f() { int arr[1]; struct S { int n; }; - auto &[bad1]; // expected-error {{decomposition declaration '[bad1]' requires an initializer}} + auto &[bad1]; // expected-error {{structured binding declaration '[bad1]' requires an initializer}} const auto &[bad2](S{}, S{}); // expected-error {{initializer for variable '[bad2]' with type 'const auto &' contains multiple expressions}} const auto &[bad3](); // expected-error {{expected expression}} auto &[good1] = arr; diff --git a/clang/test/Parser/cxx2c-binding-pack.cpp b/clang/test/Parser/cxx2c-binding-pack.cpp index 0daaad3a459ed..40d843e24ff66 100644 --- a/clang/test/Parser/cxx2c-binding-pack.cpp +++ b/clang/test/Parser/cxx2c-binding-pack.cpp @@ -12,7 +12,7 @@ void decompose_array() { auto [...] = arr; // #2 // expected-error@#2{{expected identifier}} // expected-error@#2{{{no names were provided}}} - // expected-warning@#2{{{does not allow a decomposition group to be empty}}} + // expected-warning@#2{{{does not allow a structured binding group to be empty}}} auto [a, ..., b] = arr; // #3 // expected-error@#3{{expected identifier}} // expected-error@#3{{{only 1 name was provided}}} diff --git a/clang/test/Preprocessor/embed___has_embed_parsing_errors.c b/clang/test/Preprocessor/embed___has_embed_parsing_errors.c index 9c512a4882e2d..8ab53f6b89c0d 100644 --- a/clang/test/Preprocessor/embed___has_embed_parsing_errors.c +++ b/clang/test/Preprocessor/embed___has_embed_parsing_errors.c @@ -250,3 +250,35 @@ #if __has_embed("") // expected-error {{empty filename}} #endif + +// expected-error@+3 {{missing ')' after '__has_embed'}} \ + expected-error@+3 {{expected value in expression}} \ + expected-note@+3 {{to match this '('}} +#if __has_embed (__FILE__ foo limit(1) +#endif + +//--- test3.c +// expected-error@+3 {{missing ')' after '__has_embed'}} \ + expected-error@+3 {{expected value in expression}} \ + expected-note@+3 {{to match this '('}} +#if __has_embed (__FILE__ foo +#endif + +// expected-error@+3 {{missing ')' after '__has_embed'}} \ + expected-error@+3 {{expected value in expression}} \ + expected-note@+3 {{to match this '('}} +#if __has_embed ("a" foo() +#endif + +// expected-error@+3 {{missing ')' after '__has_embed'}} \ + expected-error@+3 {{expected value in expression}} \ + expected-note@+3 {{to match this '('}} +#if __has_embed ("a" bar() foo +#endif + +// expected-error@+3 {{missing ')' after '__has_embed'}} \ + expected-error@+3 {{expected value in expression}} \ + expected-note@+3 {{to match this '('}} +#if __has_embed (__FILE__ limit(1) foo +int a = __has_embed (__FILE__); +#endif diff --git 
a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index 3036b496db25d..460778f39d003 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -234,6 +234,7 @@ // AARCH64-NEXT: #define __LONG_MAX__ 9223372036854775807L // AARCH64-NEXT: #define __LONG_WIDTH__ 64 // AARCH64-NEXT: #define __LP64__ 1 +// AARCH64-NEXT: #define __MEMORY_SCOPE_CLUSTR 5 // AARCH64-NEXT: #define __MEMORY_SCOPE_DEVICE 1 // AARCH64-NEXT: #define __MEMORY_SCOPE_SINGLE 4 // AARCH64-NEXT: #define __MEMORY_SCOPE_SYSTEM 0 @@ -989,6 +990,7 @@ // ARM64EC-MSVC: #define __LONG_LONG_MAX__ 9223372036854775807LL // ARM64EC-MSVC: #define __LONG_MAX__ 2147483647L // ARM64EC-MSVC: #define __LONG_WIDTH__ 32 +// ARM64EC-MSVC: #define __MEMORY_SCOPE_CLUSTR 5 // ARM64EC-MSVC: #define __MEMORY_SCOPE_DEVICE 1 // ARM64EC-MSVC: #define __MEMORY_SCOPE_SINGLE 4 // ARM64EC-MSVC: #define __MEMORY_SCOPE_SYSTEM 0 diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c index 71a266b8a9157..fd7ce2073a512 100644 --- a/clang/test/Preprocessor/init-loongarch.c +++ b/clang/test/Preprocessor/init-loongarch.c @@ -182,11 +182,12 @@ // LA32: #define __LONG_LONG_MAX__ 9223372036854775807LL // LA32: #define __LONG_MAX__ 2147483647L // LA32: #define __LONG_WIDTH__ 32 -// LA32: #define __MEMORY_SCOPE_DEVICE 1 -// LA32: #define __MEMORY_SCOPE_SINGLE 4 -// LA32: #define __MEMORY_SCOPE_SYSTEM 0 -// LA32: #define __MEMORY_SCOPE_WRKGRP 2 -// LA32: #define __MEMORY_SCOPE_WVFRNT 3 +// LA32: #define __MEMORY_SCOPE_CLUSTR 5 +// LA32: #define __MEMORY_SCOPE_DEVICE 1 +// LA32: #define __MEMORY_SCOPE_SINGLE 4 +// LA32: #define __MEMORY_SCOPE_SYSTEM 0 +// LA32: #define __MEMORY_SCOPE_WRKGRP 2 +// LA32: #define __MEMORY_SCOPE_WVFRNT 3 // LA32: #define __NO_INLINE__ 1 // LA32: #define __NO_MATH_ERRNO__ 1 // LA32: #define __OBJC_BOOL_IS_BOOL 0 @@ -514,11 +515,12 @@ // LA64: #define __LONG_MAX__ 9223372036854775807L // LA64: #define __LONG_WIDTH__ 64 // LA64: #define __LP64__ 1 -// LA64: #define __MEMORY_SCOPE_DEVICE 1 -// LA64: #define __MEMORY_SCOPE_SINGLE 4 -// LA64: #define __MEMORY_SCOPE_SYSTEM 0 -// LA64: #define __MEMORY_SCOPE_WRKGRP 2 -// LA64: #define __MEMORY_SCOPE_WVFRNT 3 +// LA64: #define __MEMORY_SCOPE_CLUSTR 5 +// LA64: #define __MEMORY_SCOPE_DEVICE 1 +// LA64: #define __MEMORY_SCOPE_SINGLE 4 +// LA64: #define __MEMORY_SCOPE_SYSTEM 0 +// LA64: #define __MEMORY_SCOPE_WRKGRP 2 +// LA64: #define __MEMORY_SCOPE_WVFRNT 3 // LA64: #define __NO_INLINE__ 1 // LA64: #define __NO_MATH_ERRNO__ 1 // LA64: #define __OBJC_BOOL_IS_BOOL 0 diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index 7e0df96141364..4dea1b583a089 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -1889,6 +1889,7 @@ // WEBASSEMBLY64-NEXT:#define __LONG_MAX__ 9223372036854775807L // WEBASSEMBLY64-NEXT:#define __LONG_WIDTH__ 64 // WEBASSEMBLY64-NEXT:#define __LP64__ 1 +// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_CLUSTR 5 // WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_DEVICE 1 // WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SINGLE 4 // WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SYSTEM 0 @@ -2216,6 +2217,7 @@ // AVR:#define __LDBL_MIN__ 1.17549435e-38L // AVR:#define __LONG_LONG_MAX__ 9223372036854775807LL // AVR:#define __LONG_MAX__ 2147483647L +// AVR:#define __MEMORY_SCOPE_CLUSTR 5 // AVR:#define __MEMORY_SCOPE_DEVICE 1 // AVR:#define __MEMORY_SCOPE_SINGLE 4 // AVR:#define __MEMORY_SCOPE_SYSTEM 0 @@ -2521,6 +2523,7 @@ // RISCV32: #define 
__LITTLE_ENDIAN__ 1 // RISCV32: #define __LONG_LONG_MAX__ 9223372036854775807LL // RISCV32: #define __LONG_MAX__ 2147483647L +// RISCV32: #define __MEMORY_SCOPE_CLUSTR 5 // RISCV32: #define __MEMORY_SCOPE_DEVICE 1 // RISCV32: #define __MEMORY_SCOPE_SINGLE 4 // RISCV32: #define __MEMORY_SCOPE_SYSTEM 0 @@ -2745,6 +2748,7 @@ // RISCV64: #define __LONG_LONG_MAX__ 9223372036854775807LL // RISCV64: #define __LONG_MAX__ 9223372036854775807L // RISCV64: #define __LP64__ 1 +// RISCV64: #define __MEMORY_SCOPE_CLUSTR 5 // RISCV64: #define __MEMORY_SCOPE_DEVICE 1 // RISCV64: #define __MEMORY_SCOPE_SINGLE 4 // RISCV64: #define __MEMORY_SCOPE_SYSTEM 0 @@ -2937,11 +2941,11 @@ // XTENSA: #define __GXX_ABI_VERSION {{.*}} // XTENSA: #define __ILP32__ 1 // XTENSA: #define __INT16_C(c) c -// XTENSA: #define __INT16_C_SUFFIX__ +// XTENSA: #define __INT16_C_SUFFIX__ // XTENSA: #define __INT16_MAX__ 32767 // XTENSA: #define __INT16_TYPE__ short // XTENSA: #define __INT32_C(c) c -// XTENSA: #define __INT32_C_SUFFIX__ +// XTENSA: #define __INT32_C_SUFFIX__ // XTENSA: #define __INT32_MAX__ 2147483647 // XTENSA: #define __INT32_TYPE__ int // XTENSA: #define __INT64_C(c) c##LL @@ -2949,7 +2953,7 @@ // XTENSA: #define __INT64_MAX__ 9223372036854775807LL // XTENSA: #define __INT64_TYPE__ long long int // XTENSA: #define __INT8_C(c) c -// XTENSA: #define __INT8_C_SUFFIX__ +// XTENSA: #define __INT8_C_SUFFIX__ // XTENSA: #define __INT8_MAX__ 127 // XTENSA: #define __INT8_TYPE__ signed char // XTENSA: #define __INTMAX_C(c) c##LL @@ -3008,6 +3012,7 @@ // XTENSA: #define __LONG_LONG_MAX__ 9223372036854775807LL // XTENSA: #define __LONG_MAX__ 2147483647L // XTENSA: #define __LONG_WIDTH__ 32 +// XTENSA: #define __MEMORY_SCOPE_CLUSTR 5 // XTENSA: #define __MEMORY_SCOPE_DEVICE 1 // XTENSA: #define __MEMORY_SCOPE_SINGLE 4 // XTENSA: #define __MEMORY_SCOPE_SYSTEM 0 @@ -3050,7 +3055,7 @@ // XTENSA: #define __STDC_VERSION__ 201710L // XTENSA: #define __STDC__ 1 // XTENSA: #define __UINT16_C(c) c -// XTENSA: #define __UINT16_C_SUFFIX__ +// XTENSA: #define __UINT16_C_SUFFIX__ // XTENSA: #define __UINT16_MAX__ 65535 // XTENSA: #define __UINT16_TYPE__ unsigned short // XTENSA: #define __UINT32_C(c) c##U @@ -3062,7 +3067,7 @@ // XTENSA: #define __UINT64_MAX__ 18446744073709551615ULL // XTENSA: #define __UINT64_TYPE__ long long unsigned int // XTENSA: #define __UINT8_C(c) c -// XTENSA: #define __UINT8_C_SUFFIX__ +// XTENSA: #define __UINT8_C_SUFFIX__ // XTENSA: #define __UINT8_MAX__ 255 // XTENSA: #define __UINT8_TYPE__ unsigned char // XTENSA: #define __UINTMAX_C(c) c##ULL @@ -3089,7 +3094,7 @@ // XTENSA: #define __UINT_LEAST64_TYPE__ long long unsigned int // XTENSA: #define __UINT_LEAST8_MAX__ 255 // XTENSA: #define __UINT_LEAST8_TYPE__ unsigned char -// XTENSA: #define __USER_LABEL_PREFIX__ +// XTENSA: #define __USER_LABEL_PREFIX__ // XTENSA: #define __WCHAR_MAX__ 2147483647 // XTENSA: #define __WCHAR_TYPE__ int // XTENSA: #define __WCHAR_WIDTH__ 32 diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index ecddf130a5c51..a3c3697c3a0b9 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -1911,7 +1911,6 @@ // CHECK_GNR_M32: #define __TSXLDTRK__ 1 // CHECK_GNR_M32: #define __UINTR__ 1 // CHECK_GNR_M32-NOT: #define __USERMSR__ 1 -// CHECK_DMR_M32: #define __USERMSR__ 1 // CHECK_GNR_M32: #define __VAES__ 1 // CHECK_GNR_M32: #define __VPCLMULQDQ__ 1 // CHECK_GNR_M32: #define __WAITPKG__ 1 @@ -2018,7 +2017,6 @@ // 
CHECK_GNR_M64: #define __TSXLDTRK__ 1 // CHECK_GNR_M64: #define __UINTR__ 1 // CHECK_GNR_M64-NOT: #define __USERMSR__ 1 -// CHECK_DMR_M64: #define __USERMSR__ 1 // CHECK_GNR_M64: #define __VAES__ 1 // CHECK_GNR_M64: #define __VPCLMULQDQ__ 1 // CHECK_GNR_M64: #define __WAITPKG__ 1 @@ -2525,10 +2523,16 @@ // RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_KL_M32 // RUN: %clang -march=pantherlake -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_PTL_M32,CHECK_NKL_M32 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_NKL_M32 +// RUN: %clang -march=wildcatlake -m32 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_NKL_M32 +// RUN: %clang -march=novalake -m32 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M32,CHECK_ARLS_M32,CHECK_NVL_M32,CHECK_NKL_M32 // RUN: %clang -march=clearwaterforest -m32 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_ARLS_M32,CHECK_PTL_M32,CHECK_CWF_M32,CHECK_NKL_M32 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_SRF_M32,CHECK_ARLS_M32,CHECK_NVL_M32,CHECK_UMSR_M32,CHECK_NKL_M32 // CHECK_ARL_M32: #define __ADX__ 1 // CHECK_ARL_M32: #define __AES__ 1 // CHECK_ARL_M32: #define __AVX2__ 1 @@ -2568,7 +2572,7 @@ // CHECK_ARL_M32: #define __POPCNT__ 1 // CHECK_ARL_M32-NOT: #define __PREFETCHI__ 1 // CHECK_ARLS_M32-NOT: #define __PREFETCHI__ 1 -// CHECK_PTL_M32: #define __PREFETCHI__ 1 +// CHECK_NVL_M32: #define __PREFETCHI__ 1 // CHECK_ARL_M32: #define __PRFCHW__ 1 // CHECK_ARL_M32: #define __PTWRITE__ 1 // CHECK_ARL_M32-NOT: #define __RAOINT__ 1 @@ -2595,8 +2599,7 @@ // CHECK_ARL_M32: #define __UINTR__ 1 // CHECK_ARL_M32-NOT: #define __USERMSR__ 1 // CHECK_ARLS_M32-NOT: #define __USERMSR__ 1 -// CHECK_PTL_M32-NOT: #define __USERMSR__ 1 -// CHECK_CWF_M32: #define __USERMSR__ 1 +// CHECK_UMSR_M32: #define __USERMSR__ 1 // CHECK_ARL_M32: #define __VAES__ 1 // CHECK_ARL_M32: #define __VPCLMULQDQ__ 1 // CHECK_ARL_M32: #define __WAITPKG__ 1 @@ -2630,10 +2633,16 @@ // RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_KL_M64 // RUN: %clang -march=pantherlake -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_PTL_M64,CHECK_NKL_M64 +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_NKL_M64 +// RUN: %clang -march=wildcatlake -m64 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_NKL_M64 +// RUN: %clang -march=novalake -m64 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_ARLS_M64,CHECK_NVL_M64,CHECK_NKL_M64 // RUN: %clang -march=clearwaterforest -m64 -E -dM %s -o - 2>&1 \ // RUN: -target i386-unknown-linux \ -// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64,CHECK_ARLS_M64,CHECK_PTL_M64,CHECK_CWF_M64,CHECK_NKL_M64 +// RUN: | FileCheck -match-full-lines %s 
-check-prefixes=CHECK_ARL_M64,CHECK_SRF_M64,CHECK_ARLS_M64,CHECK_NVL_M64,CHECK_UMSR_M64,CHECK_NKL_M64 // CHECK_ARL_M64: #define __ADX__ 1 // CHECK_ARL_M64: #define __AES__ 1 // CHECK_ARL_M64: #define __AVX2__ 1 @@ -2673,7 +2682,7 @@ // CHECK_ARL_M64: #define __POPCNT__ 1 // CHECK_ARL_M64-NOT: #define __PREFETCHI__ 1 // CHECK_ARLS_M64-NOT: #define __PREFETCHI__ 1 -// CHECK_PTL_M64: #define __PREFETCHI__ 1 +// CHECK_NVL_M64: #define __PREFETCHI__ 1 // CHECK_ARL_M64: #define __PRFCHW__ 1 // CHECK_ARL_M64: #define __PTWRITE__ 1 // CHECK_ARL_M64-NOT: #define __RAOINT__ 1 @@ -2701,8 +2710,7 @@ // CHECK_ARL_M64: #define __UINTR__ 1 // CHECK_ARL_M64-NOT: #define __USERMSR__ 1 // CHECK_ARLS_M64-NOT: #define __USERMSR__ 1 -// CHECK_PTL_M64-NOT: #define __USERMSR__ 1 -// CHECK_CWF_M64: #define __USERMSR__ 1 +// CHECK_UMSR_M64: #define __USERMSR__ 1 // CHECK_ARL_M64: #define __VAES__ 1 // CHECK_ARL_M64: #define __VPCLMULQDQ__ 1 // CHECK_ARL_M64: #define __WAITPKG__ 1 diff --git a/clang/test/Preprocessor/riscv-atomics.c b/clang/test/Preprocessor/riscv-atomics.c new file mode 100644 index 0000000000000..6e02173392069 --- /dev/null +++ b/clang/test/Preprocessor/riscv-atomics.c @@ -0,0 +1,24 @@ +// RUN: %clang --target=riscv32-unknown-linux-gnu -march=rv32ia -x c -E -dM %s \ +// RUN: -o - | FileCheck %s +// RUN: %clang --target=riscv32-unknown-linux-gnu -march=rv32i_zalrsc -x c -E \ +// RUN: -dM %s -o - | FileCheck %s +// RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64ia -x c -E -dM %s \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,CHECK-RV64 +// RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64i_zalrsc -x c -E \ +// RUN: -dM %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-RV64 + +// CHECK: #define __GCC_ATOMIC_BOOL_LOCK_FREE 2 +// CHECK: #define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2 +// CHECK: #define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2 +// CHECK: #define __GCC_ATOMIC_CHAR_LOCK_FREE 2 +// CHECK: #define __GCC_ATOMIC_INT_LOCK_FREE 2 +// CHECK-RV64: #define __GCC_ATOMIC_LLONG_LOCK_FREE 2 +// CHECK: #define __GCC_ATOMIC_LONG_LOCK_FREE 2 +// CHECK: #define __GCC_ATOMIC_POINTER_LOCK_FREE 2 +// CHECK: #define __GCC_ATOMIC_SHORT_LOCK_FREE 2 +// CHECK: #define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1 +// CHECK: #define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2 +// CHECK: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1 +// CHECK: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 +// CHECK: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 +// CHECK-RV64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1 diff --git a/clang/test/Preprocessor/systemz_asm_flag_output.c b/clang/test/Preprocessor/systemz_asm_flag_output.c new file mode 100644 index 0000000000000..b627499d5ce46 --- /dev/null +++ b/clang/test/Preprocessor/systemz_asm_flag_output.c @@ -0,0 +1,4 @@ +// RUN: %clang -target systemz-unknown-unknown -x c -E -dM -o - %s | FileCheck -match-full-lines %s +// RUN: %clang -target s390x-unknown-unknown -x c -E -dM -o - %s | FileCheck -match-full-lines %s + +// CHECK: #define __GCC_ASM_FLAG_OUTPUTS__ 1 diff --git a/clang/test/Sema/attr-cpuspecific-cpus.c b/clang/test/Sema/attr-cpuspecific-cpus.c index 48543ac30da81..2360d861b91ae 100644 --- a/clang/test/Sema/attr-cpuspecific-cpus.c +++ b/clang/test/Sema/attr-cpuspecific-cpus.c @@ -87,3 +87,5 @@ ATTR(cpu_specific(lunarlake)) void CPU37(void){} ATTR(cpu_specific(gracemont)) void CPU38(void){} ATTR(cpu_specific(pantherlake)) void CPU39(void){} ATTR(cpu_specific(clearwaterforest)) void CPU40(void){} +ATTR(cpu_specific(wildcatlake)) void CPU41(void){} 
+ATTR(cpu_specific(novalake)) void CPU42(void){} diff --git a/clang/test/Sema/attr-print.c b/clang/test/Sema/attr-print.c index 8492356e5d2e5..211e61a937f63 100644 --- a/clang/test/Sema/attr-print.c +++ b/clang/test/Sema/attr-print.c @@ -35,3 +35,6 @@ int * __sptr * __ptr32 ppsp32; // CHECK: __attribute__((availability(macos, strict, introduced=10.6))); void f6(int) __attribute__((availability(macosx,strict,introduced=10.6))); + +// CHECK: _libc_intl_domainname asm("__gi__libc_intl_domainname") __attribute__((visibility("hidden"))); +extern const char _libc_intl_domainname[]; extern typeof (_libc_intl_domainname) _libc_intl_domainname asm("__gi__libc_intl_domainname") __attribute__((visibility("hidden"))); diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h index 10db947d8246c..2bf45e03d91c7 100644 --- a/clang/test/SemaCUDA/Inputs/cuda.h +++ b/clang/test/SemaCUDA/Inputs/cuda.h @@ -13,6 +13,8 @@ #define __managed__ __attribute__((managed)) #define __grid_constant__ __attribute__((grid_constant)) #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__))) +#define __cluster_dims__(...) __attribute__((cluster_dims(__VA_ARGS__))) +#define __no_cluster__ __attribute__((no_cluster)) struct dim3 { unsigned x, y, z; diff --git a/clang/test/SemaCUDA/atomic-ops.cu b/clang/test/SemaCUDA/atomic-ops.cu index 233ed1c10fc11..40e110c4b9b77 100644 --- a/clang/test/SemaCUDA/atomic-ops.cu +++ b/clang/test/SemaCUDA/atomic-ops.cu @@ -2,6 +2,8 @@ #include "Inputs/cuda.h" +#define INVALID_HIP_MEMORY_SCOPE (__HIP_MEMORY_SCOPE_CLUSTER+1) + __device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pll, unsigned long long *pull, float *fp, double *dbl) { int val = __hip_atomic_load(0); // expected-error {{too few arguments to function call, expected 3, have 1}} val = __hip_atomic_load(0, 0, 0, 0); // expected-error {{too many arguments to function call, expected 3, have 4}} @@ -10,9 +12,10 @@ __device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pl val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER); val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); - val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}} + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, INVALID_HIP_MEMORY_SCOPE); // expected-error {{synchronization scope argument to atomic operation is invalid}} val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_load(pi32, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_load(pi32, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -35,9 +38,10 @@ __device__ int test_hip_atomic_store(int *pi32, unsigned int *pu32, long long *p __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER); __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); 
__hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); - __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}} + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, INVALID_HIP_MEMORY_SCOPE); // expected-error {{synchronization scope argument to atomic operation is invalid}} __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); __hip_atomic_store(pi32, 0, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD); __hip_atomic_store(pi32, 0, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}} @@ -71,6 +75,7 @@ __device__ bool test_hip_atomic_cmpxchg_weak(int *ptr, int val, int desired) { flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_CONSUME, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD); diff --git a/clang/test/SemaCUDA/cluster_dims.cu b/clang/test/SemaCUDA/cluster_dims.cu new file mode 100644 index 0000000000000..dcb8737a51006 --- /dev/null +++ b/clang/test/SemaCUDA/cluster_dims.cu @@ -0,0 +1,64 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -ast-print -x hip -verify=NS,all %s +// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -fcuda-is-device -ast-print -x hip -verify=NS,all %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -fcuda-is-device -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s +// RUN: %clang_cc1 -triple nvptx-nvidia-cuda -target-cpu sm_90 -fcuda-is-device -ast-print -x hip -verify=cuda,common,all %s | FileCheck -check-prefixes=CHECK %s +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -aux-triple amdgcn-amd-amdhsa -ast-print -x hip -verify=amd,common,all %s | FileCheck -check-prefixes=CHECK %s + +#include "Inputs/cuda.h" + +const int constint = 4; + +// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2, 2))) void test_literal_3d() +__global__ void __cluster_dims__(2, 2, 2) test_literal_3d() {} //NS-error {{'cluster_dims' is not supported for this GPU architecture}} + +// CHECK: __attribute__((global)) __attribute__((cluster_dims(2, 2))) void test_literal_2d() +__global__ void __cluster_dims__(2, 2) test_literal_2d() {} //NS-error {{'cluster_dims' is not supported for this GPU architecture}} + +// CHECK: __attribute__((global)) __attribute__((cluster_dims(4))) void test_literal_1d() +__global__ void __cluster_dims__(4) test_literal_1d() {} //NS-error {{'cluster_dims' is not supported for this GPU architecture}} + +// CHECK: __attribute__((global)) __attribute__((cluster_dims(constint, constint / 4, 1))) void test_constant() +__global__ void __cluster_dims__(constint, constint / 4, 1) test_constant() {} //NS-error 
{{'cluster_dims' is not supported for this GPU architecture}} + +// CHECK: template void test_template() __attribute__((cluster_dims(x, y, z))) +template void test_template(void) __cluster_dims__(x, y, z){} //NS-error {{'cluster_dims' is not supported for this GPU architecture}} + +// CHECK: template void test_template_expr() __attribute__((cluster_dims(x + constint, y, z))) +template void test_template_expr(void) __cluster_dims__(x + constint, y, z) {} //NS-error {{'cluster_dims' is not supported for this GPU architecture}} + +//NS-error@+1 {{'cluster_dims' is not supported for this GPU architecture}} +__global__ void __cluster_dims__(32, 2, 4) test_too_large_dim_0() {} // common-error {{integer constant expression evaluates to value 32 that cannot be represented in a 4-bit unsigned integer type}} + +// cuda-error@+2 {{cluster does not support more than 8 thread blocks; 64 provided}} +// amd-error@+1 {{cluster does not support more than 16 thread blocks; 64 provided}} +__global__ void __cluster_dims__(4, 4, 4) test_too_large_dim_1() {} // NS-error {{'cluster_dims' is not supported for this GPU architecture}} + +// cuda-error@+3 {{cluster does not support more than 8 thread blocks; 64 provided}} +// amd-error@+2 {{cluster does not support more than 16 thread blocks; 64 provided}} +template +__global__ void __cluster_dims__(a, b, c) test_too_large_dim_template() {} // NS-error {{'cluster_dims' is not supported for this GPU architecture}} +template __global__ void test_too_large_dim_template<4, 4, 4>(); // common-note {{in instantiation of function template specialization 'test_too_large_dim_template<4U, 4U, 4U>' requested here}} + +int none_const_int = 4; + +//NS-error@+1 {{'cluster_dims' is not supported for this GPU architecture}} +__global__ void __cluster_dims__(none_const_int, 2, 4) test_non_constant_0() {} // common-error {{'cluster_dims' attribute requires parameter 0 to be an integer constant}} + +//NS-error@+1 {{'cluster_dims' is not supported for this GPU architecture}} +__global__ void __cluster_dims__(8, none_const_int / 2, 4) test_non_constant_1() {} // common-error {{'cluster_dims' attribute requires parameter 1 to be an integer constant}} + +//NS-error@+1 {{'cluster_dims' is not supported for this GPU architecture}} +__global__ void __cluster_dims__(8, 2, none_const_int / 4) test_non_constant_2() {} // common-error {{'cluster_dims' attribute requires parameter 2 to be an integer constant}} + +//NS-error@+1 {{'no_cluster' is not supported for this GPU architecture}} +__global__ void __no_cluster__ test_no_cluster() {} + +//NS-error@+2 {{'no_cluster' is not supported for this GPU architecture}} +//NS-error@+1 {{'cluster_dims' is not supported for this GPU architecture}} +__global__ void __no_cluster__ __cluster_dims__(2,2,2) test_have_both() {} // common-error {{'cluster_dims' and 'no_cluster' attributes are not compatible}} common-note {{conflicting attribute is here}} + +template +__cluster_dims__(args) void test_template_variadic_args(void) {} // all-error {{expression contains unexpanded parameter pack 'args'}} + +template +__cluster_dims__(1, args) void test_template_variadic_args_2(void) {} // all-error {{expression contains unexpanded parameter pack 'args'}} diff --git a/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu b/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu index ea1f24670ff9a..503e786877819 100644 --- a/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu +++ b/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu @@ -2,6 +2,8 @@ #include "Inputs/cuda.h" +#define 
diff --git a/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu b/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu
index ea1f24670ff9a..503e786877819 100644
--- a/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu
+++ b/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu
@@ -2,6 +2,8 @@

 #include "Inputs/cuda.h"

+#define INVALID_HIP_MEMORY_SCOPE (__HIP_MEMORY_SCOPE_CLUSTER+1)
+
 __device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pll, unsigned long long *pull, float *fp, double *dbl) {
   int val = __hip_atomic_load(0); // expected-error {{too few arguments to function call, expected 3, have 1}}
   val = __hip_atomic_load(0, 0, 0, 0); // expected-error {{too many arguments to function call, expected 3, have 4}}
@@ -10,9 +12,10 @@ __device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pl
   val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
   val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
   val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+  val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER);
   val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
-  val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}}
+  val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, INVALID_HIP_MEMORY_SCOPE); // expected-error {{synchronization scope argument to atomic operation is invalid}}
   val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
   val = __hip_atomic_load(pi32, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
   val = __hip_atomic_load(pi32, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD);
@@ -35,9 +38,10 @@ __device__ int test_hip_atomic_store(int *pi32, unsigned int *pu32, long long *p
   __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
   __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
   __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+  __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER);
   __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
-  __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}}
+  __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, INVALID_HIP_MEMORY_SCOPE); // expected-error {{synchronization scope argument to atomic operation is invalid}}
   __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
   __hip_atomic_store(pi32, 0, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
   __hip_atomic_store(pi32, 0, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}}
@@ -71,6 +75,7 @@ __device__ bool test_hip_atomic_cmpxchg_weak(int *ptr, int val, int desired) {
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_CONSUME, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+  flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD);
   flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
diff --git a/clang/test/SemaCXX/builtin-structured-binding-size.cpp b/clang/test/SemaCXX/builtin-structured-binding-size.cpp
index bcd13a6fb720d..a8f670546117f 100644
--- a/clang/test/SemaCXX/builtin-structured-binding-size.cpp
+++ b/clang/test/SemaCXX/builtin-structured-binding-size.cpp
@@ -40,8 +40,8 @@ static_assert(__builtin_structured_binding_size(S5) == 2);
 // expected-error@-1 {{static assertion failed due to requirement '__builtin_structured_binding_size(S5) == 2'}} \
 // expected-note@-1 {{expression evaluates to '1 == 2'}}
 static_assert(__builtin_structured_binding_size(S6) == 2);
-// expected-error@-1 {{cannot decompose class type 'S6' because it has an anonymous union member}} \
-// expected-error@-1 {{type 'S6' cannot be decomposed}} \
+// expected-error@-1 {{cannot bind class type 'S6' because it has an anonymous union member}} \
+// expected-error@-1 {{type 'S6' cannot be bound}} \
 // expected-error@-1 {{static assertion expression is not an integral constant expression}} \
 // expected-note@#note-anon-union {{declared here}}
 static_assert(__builtin_structured_binding_size(S7) == 1);
@@ -49,15 +49,15 @@ static_assert(__builtin_structured_binding_size(S7) == 1);

 static_assert(__builtin_structured_binding_size(SD) == 1);
 static_assert(__builtin_structured_binding_size(SE1) == 1);
-// expected-error@-1 {{cannot decompose class type 'SE1': both it and its base class 'S1' have non-static data members}} \
-// expected-error@-1 {{type 'SE1' cannot be decomposed}} \
+// expected-error@-1 {{cannot bind class type 'SE1': both it and its base class 'S1' have non-static data members}} \
+// expected-error@-1 {{type 'SE1' cannot be bound}} \
 // expected-error@-1 {{static assertion expression is not an integral constant expression}}
 static_assert(__builtin_structured_binding_size(U1) == 0);
-// expected-error@-1 {{type 'U1' cannot be decomposed}} \
+// expected-error@-1 {{type 'U1' cannot be bound}} \
 // expected-error@-1 {{static assertion expression is not an integral constant expression}}
 static_assert(__builtin_structured_binding_size(U2) == 0);
-// expected-error@-1 {{type 'U2' cannot be decomposed}} \
+// expected-error@-1 {{type 'U2' cannot be bound}} \
 // expected-error@-1 {{static assertion expression is not an integral constant expression}}
@@ -75,7 +75,7 @@ static_assert(__builtin_structured_binding_size(decltype(__builtin_complex(0., 0

 int VLASize; // expected-note {{declared here}}
 static_assert(__builtin_structured_binding_size(int[VLASize]) == 42);
-// expected-error@-1 {{type 'int[VLASize]' cannot be decomposed}} \
+// expected-error@-1 {{type 'int[VLASize]' cannot be bound}} \
 // expected-warning@-1 {{variable length arrays in C++ are a Clang extension}} \
 // expected-note@-1 {{read of non-const variable 'VLASize' is not allowed in a constant expression}} \
 // expected-error@-1 {{static assertion expression is not an integral constant expression}}
@@ -84,10 +84,10 @@ static_assert(__builtin_structured_binding_size(int[VLASize]) == 42);
 struct Incomplete; // expected-note {{forward declaration of 'Incomplete'}}
 static_assert(__builtin_structured_binding_size(Incomplete) == 1);
 // expected-error@-1 {{incomplete type 'Incomplete' where a complete type is required}} \
-// expected-error@-1 {{type 'Incomplete' cannot be decomposed}} \
+// expected-error@-1 {{type 'Incomplete' cannot be bound}} \
 // expected-error@-1 {{static assertion expression is not an integral constant expression}}
 static_assert(__builtin_structured_binding_size(Incomplete[]) == 1);
-// expected-error@-1 {{type 'Incomplete[]' cannot be decomposed}} \
+// expected-error@-1 {{type 'Incomplete[]' cannot be bound}} \
 // expected-error@-1 {{static assertion expression is not an integral constant expression}}
 static_assert(__builtin_structured_binding_size(Incomplete[0]) == 0);
 static_assert(__builtin_structured_binding_size(Incomplete[1]) == 1);
@@ -97,12 +97,12 @@ static_assert(__builtin_structured_binding_size(Incomplete[42]) == 42);

 static_assert(__builtin_structured_binding_size(P1) == 0);
 // expected-error@-1 {{static assertion failed due to requirement '__builtin_structured_binding_size(P1) == 0'}} \
 // expected-note@-1 {{expression evaluates to '1 == 0'}} \
-// expected-error@-1 {{cannot decompose private member 'a' of 'P1}} \
+// expected-error@-1 {{cannot bind private member 'a' of 'P1}} \
 // expected-note@#note-private {{implicitly declared private here}}

 void func(int array[14], int x = __builtin_structured_binding_size(decltype(array)));
-//expected-error@-1 {{type 'decltype(array)' (aka 'int *') cannot be decomposed}}
+//expected-error@-1 {{type 'decltype(array)' (aka 'int *') cannot be bound}}

 struct SM {
   static int array[14];
@@ -115,7 +115,7 @@ struct T {
 };

 T<int> t1;
-// expected-error@#tpl-1 {{type 'int' cannot be decomposed}} \
+// expected-error@#tpl-1 {{type 'int' cannot be bound}} \
 // expected-error@#tpl-1 {{non-type template argument is not a constant expression}} \
 // expected-note@-1 {{in instantiation of default argument for 'T<int>' required here}} \
 // expected-note@-1 {{while checking a default template argument used here}} \
@@ -183,8 +183,8 @@ static_assert(!is_destructurable);
 static_assert(__builtin_structured_binding_size(T1) == 1);
 static_assert(__builtin_structured_binding_size(T42) == 42);
 static_assert(__builtin_structured_binding_size(TSizeError) == 42);
-// expected-error@-1 {{cannot decompose this type; 'std::tuple_size<TSizeError>::value' is not a valid integral constant expression}} \
-// expected-error@-1 {{type 'TSizeError' cannot be decomposed}} \
+// expected-error@-1 {{cannot bind this type; 'std::tuple_size<TSizeError>::value' is not a valid integral constant expression}} \
+// expected-error@-1 {{type 'TSizeError' cannot be bound}} \
 // expected-error@-1 {{static assertion expression is not an integral constant expression}}
 static_assert(!is_destructurable<TSizeError>);
 }
@@ -195,7 +195,7 @@ struct S {
   int y;
 static_assert(__builtin_structured_binding_size(S) == 2);
 //expected-error@-1 {{incomplete type 'S' where a complete type is required}} \
-// expected-error@-1 {{type 'S' cannot be decomposed}} \
+// expected-error@-1 {{type 'S' cannot be bound}} \
 // expected-error@-1 {{static assertion expression is not an integral constant expression}} \
 // expected-note@-4 {{definition of 'S' is not complete until the closing '}'}}
 };
@@ -228,20 +228,20 @@ static_assert(__is_same_as(tag_of_t, int));

 static_assert(__is_same_as(tag_of_t<int>, int)); // error
 // expected-error@-1 {{constraints not satisfied for alias template 'tag_of_t' [with T = int]}}
-// expected-note@#tag-of-constr {{because substituted constraint expression is ill-formed: type 'int' cannot be decomposed}}
+// expected-note@#tag-of-constr {{because substituted constraint expression is ill-formed: type 'int' cannot be bound}}

 struct MinusOne;
 template <> struct ::std::tuple_size<MinusOne> { static constexpr int value = -1; };
 int minus_one = __builtin_structured_binding_size(MinusOne);
-// expected-error@-1 {{cannot decompose this type; 'std::tuple_size<MinusOne>::value' is not a valid size: -1}}
-// expected-error@-2 {{type 'MinusOne' cannot be decomposed}}
+// expected-error@-1 {{cannot bind this type; 'std::tuple_size<MinusOne>::value' is not a valid size: -1}}
+// expected-error@-2 {{type 'MinusOne' cannot be bound}}

 struct UintMax;
 template <> struct ::std::tuple_size<UintMax> { static constexpr unsigned value = -1; };
 int uint_max = __builtin_structured_binding_size(UintMax);
-// expected-error@-1 {{cannot decompose this type; 'std::tuple_size<UintMax>::value' is not a valid size: 4294967295}}
-// expected-error@-2 {{type 'UintMax' cannot be decomposed}}
+// expected-error@-1 {{cannot bind this type; 'std::tuple_size<UintMax>::value' is not a valid size: 4294967295}}
+// expected-error@-2 {{type 'UintMax' cannot be bound}}
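Context for the renamed diagnostics above: `__builtin_structured_binding_size(T)` yields the number of elements a structured binding of `T` would introduce, and the messages now say a type "cannot be bound" rather than "cannot be decomposed". A small illustrative sketch of the well-formed cases (hypothetical `Point` type, not from the test):

```c++
// Sketch: member count for simple aggregates, extent for arrays.
struct Point { int x, y; };
static_assert(__builtin_structured_binding_size(Point) == 2);
static_assert(__builtin_structured_binding_size(int[3]) == 3);
```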
diff --git a/clang/test/SemaCXX/constexpr-string.cpp b/clang/test/SemaCXX/constexpr-string.cpp
index c456740ef7551..93e234685d284 100644
--- a/clang/test/SemaCXX/constexpr-string.cpp
+++ b/clang/test/SemaCXX/constexpr-string.cpp
@@ -7,6 +7,8 @@
 // RUN: %clang_cc1 %s -triple armebv7-unknown-linux -std=c++2a -fsyntax-only -verify -pedantic -Wno-vla-extension -fno-signed-char
 // RUN: %clang_cc1 %s -triple armebv7-unknown-linux -std=c++2a -fsyntax-only -verify -pedantic -Wno-vla-extension -fno-wchar -DNO_PREDEFINED_WCHAR_T

+// RUN: %clang_cc1 %s -triple armebv7-unknown-linux -std=c++2a -fsyntax-only -verify -pedantic -Wno-vla-extension -fno-signed-char -fexperimental-new-constant-interpreter
+
 # 9 "/usr/include/string.h" 1 3 4 // expected-warning {{this style of line directive is a GNU extension}}
 extern "C" {
   typedef decltype(sizeof(int)) size_t;
diff --git a/clang/test/SemaCXX/cxx17-compat.cpp b/clang/test/SemaCXX/cxx17-compat.cpp
index 99e41d818a6c3..1c9060d388d2a 100644
--- a/clang/test/SemaCXX/cxx17-compat.cpp
+++ b/clang/test/SemaCXX/cxx17-compat.cpp
@@ -76,18 +76,18 @@ struct ConstexprVirtual {
 struct C { int x, y, z; };
 static auto [cx, cy, cz] = C();
 #if __cplusplus <= 201703L
-  // expected-warning@-2 {{decomposition declaration declared 'static' is a C++20 extension}}
+  // expected-warning@-2 {{structured binding declaration declared 'static' is a C++20 extension}}
 #else
-  // expected-warning@-4 {{decomposition declaration declared 'static' is incompatible with C++ standards before C++20}}
+  // expected-warning@-4 {{structured binding declaration declared 'static' is incompatible with C++ standards before C++20}}
 #endif

 void f() {
   static thread_local auto [cx, cy, cz] = C();
 #if __cplusplus <= 201703L
-  // expected-warning@-2 {{decomposition declaration declared 'static' is a C++20 extension}}
-  // expected-warning@-3 {{decomposition declaration declared 'thread_local' is a C++20 extension}}
+  // expected-warning@-2 {{structured binding declaration declared 'static' is a C++20 extension}}
+  // expected-warning@-3 {{structured binding declaration declared 'thread_local' is a C++20 extension}}
 #else
-  // expected-warning@-5 {{decomposition declaration declared 'static' is incompatible with C++ standards before C++20}}
-  // expected-warning@-6 {{decomposition declaration declared 'thread_local' is incompatible with C++ standards before C++20}}
+  // expected-warning@-5 {{structured binding declaration declared 'static' is incompatible with C++ standards before C++20}}
+  // expected-warning@-6 {{structured binding declaration declared 'thread_local' is incompatible with C++ standards before C++20}}
 #endif
 }
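The rewording above also covers the storage-class diagnostics: C++20 allows `static` and `thread_local` on a structured binding declaration, while pre-C++20 modes accept them only as an extension with the warnings quoted in the test. A sketch of the accepted C++20 form (mirrors the test's own pattern):

```c++
// Sketch: storage-class specifiers on structured bindings, valid in C++20.
struct C { int x, y, z; };
static auto [cx, cy, cz] = C();
void g() { static thread_local auto [tx, ty, tz] = C(); }
```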
diff --git a/clang/test/SemaCXX/cxx1z-decomposition.cpp b/clang/test/SemaCXX/cxx1z-decomposition.cpp
index 6ee1249a66c3f..158a3a66deb47 100644
--- a/clang/test/SemaCXX/cxx1z-decomposition.cpp
+++ b/clang/test/SemaCXX/cxx1z-decomposition.cpp
@@ -3,22 +3,22 @@
 // RUN: %clang_cc1 -std=c++20 -Wpre-c++20-compat -fexperimental-new-constant-interpreter -verify=expected %s

 void use_from_own_init() {
-  auto [a] = a; // expected-error {{binding 'a' cannot appear in the initializer of its own decomposition declaration}}
+  auto [a] = a; // expected-error {{binding 'a' cannot appear in the initializer of its own structured binding declaration}}
 }

 void num_elems() {
   struct A0 {} a0;
   int a1[1], a2[2];

-  auto [] = a0; // expected-warning {{does not allow a decomposition group to be empty}}
-  auto [v1] = a0; // expected-error {{type 'struct A0' decomposes into 0 elements, but 1 name was provided}}
-  auto [] = a1; // expected-error {{type 'int[1]' decomposes into 1 element, but no names were provided}} expected-warning {{empty}}
+  auto [] = a0; // expected-warning {{does not allow a structured binding group to be empty}}
+  auto [v1] = a0; // expected-error {{type 'struct A0' binds to 0 elements, but 1 name was provided}}
+  auto [] = a1; // expected-error {{type 'int[1]' binds to 1 element, but no names were provided}} expected-warning {{empty}}
   auto [v2] = a1;
-  auto [v3, v4] = a1; // expected-error {{type 'int[1]' decomposes into 1 element, but 2 names were provided}}
-  auto [] = a2; // expected-error {{type 'int[2]' decomposes into 2 elements, but no names were provided}} expected-warning {{empty}}
-  auto [v5] = a2; // expected-error {{type 'int[2]' decomposes into 2 elements, but only 1 name was provided}}
+  auto [v3, v4] = a1; // expected-error {{type 'int[1]' binds to 1 element, but 2 names were provided}}
+  auto [] = a2; // expected-error {{type 'int[2]' binds to 2 elements, but no names were provided}} expected-warning {{empty}}
+  auto [v5] = a2; // expected-error {{type 'int[2]' binds to 2 elements, but only 1 name was provided}}
   auto [v6, v7] = a2;
-  auto [v8, v9, v10] = a2; // expected-error {{type 'int[2]' decomposes into 2 elements, but 3 names were provided}}
+  auto [v8, v9, v10] = a2; // expected-error {{type 'int[2]' binds to 2 elements, but 3 names were provided}}
 }

 // As a Clang extension, _Complex can be decomposed.
@@ -105,7 +105,7 @@ void enclosing() {
 void bitfield() {
   struct { int a : 3, : 4, b : 5; } a;
   auto &[x, y] = a;
-  auto &[p, q, r] = a; // expected-error-re {{type 'struct (unnamed struct at {{.*}})' decomposes into 2 elements, but 3 names were provided}}
+  auto &[p, q, r] = a; // expected-error-re {{type 'struct (unnamed struct at {{.*}})' binds to 2 elements, but 3 names were provided}}
 }

 void for_range() {
@@ -115,7 +115,7 @@ void for_range() {
   }

   int y[5];
-  for (auto[c] : y) { // expected-error {{cannot decompose non-class, non-array type 'int'}}
+  for (auto[c] : y) { // expected-error {{cannot bind non-class, non-array type 'int'}}
     c++;
   }
 }
@@ -157,16 +157,16 @@ int f2() {
 namespace lambdas {
   void f() {
     int n;
-    auto [a] = // expected-error {{cannot decompose lambda closure type}}
+    auto [a] = // expected-error {{cannot bind lambda closure type}}
       [n] {}; // expected-note {{lambda expression}}
   }

-  auto [] = []{}; // expected-warning {{ISO C++17 does not allow a decomposition group to be empty}}
+  auto [] = []{}; // expected-warning {{ISO C++17 does not allow a structured binding group to be empty}}

  int g() {
    int n = 0;
    auto a = [=](auto &self) { // expected-note {{lambda expression}}
-      auto &[capture] = self; // expected-error {{cannot decompose lambda closure type}}
+      auto &[capture] = self; // expected-error {{cannot bind lambda closure type}}
       ++capture;
       return n;
     };
@@ -188,14 +188,14 @@ namespace lambdas {
     struct A : decltype(x) {
       int n;
     };
-    auto &&[r] = A{x, 0}; // expected-error-re {{cannot decompose class type 'A': both it and its base class 'decltype(x)' (aka '(lambda {{.*}})') have non-static data members}}
+    auto &&[r] = A{x, 0}; // expected-error-re {{cannot bind class type 'A': both it and its base class 'decltype(x)' (aka '(lambda {{.*}})') have non-static data members}}
     return r;
   }

   void j() {
     auto x = [] {};
     struct A : decltype(x) {};
-    auto &&[] = A{x}; // expected-warning {{ISO C++17 does not allow a decomposition group to be empty}}
+    auto &&[] = A{x}; // expected-warning {{ISO C++17 does not allow a structured binding group to be empty}}
   }
 }
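The "binds to N elements" wording replaces "decomposes into N elements" throughout the arity checks: the bracketed name list must match the element count of the initializer's type exactly. A short sketch of the rule (hypothetical function name):

```c++
// Sketch: the name count must equal the element count.
void arity() {
  int a2[2] = {1, 2};
  auto [p, q] = a2;  // OK: 2 names for 2 elements
  // auto [p2] = a2; // error: 'int[2]' binds to 2 elements, but only 1 name was provided
}
```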
diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
index 6cf0e0251ab62..331fe8387e1c7 100644
--- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
+++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
@@ -626,3 +626,20 @@ void fn() {
 }

 }
+
+
+namespace GH109096 {
+consteval void undefined();
+template <typename T>
+struct scope_exit {
+  T t;
+  constexpr ~scope_exit() { t(); }
+  // expected-error@-1 {{call to immediate function 'GH109096::(anonymous class)::operator()' is not a constant expression}} \
+  // expected-note@-1 {{implicit use of 'this' pointer is only allowed within the evaluation}}
+};
+
+scope_exit guard( // expected-note {{in instantiation of member function}}
+  []() { undefined(); }
+);
+
+}
diff --git a/clang/test/SemaCXX/cxx2c-binding-pack-nontemplate.cpp b/clang/test/SemaCXX/cxx2c-binding-pack-nontemplate.cpp
index 638a2d805c2c5..0dfb52b3e0396 100644
--- a/clang/test/SemaCXX/cxx2c-binding-pack-nontemplate.cpp
+++ b/clang/test/SemaCXX/cxx2c-binding-pack-nontemplate.cpp
@@ -10,7 +10,7 @@ void decompose_array() {
   auto [x, ...rest, y] = arr;

   // cxx26-warning@+4 {{structured binding packs are incompatible with C++ standards before C++2c}}
-  // cxx23-error@+3 {{decomposition declaration cannot be declared 'constexpr'}}
+  // cxx23-error@+3 {{structured binding declaration cannot be declared 'constexpr'}}
   // cxx23-warning@+2 {{structured binding packs are a C++2c extension}}
   // nontemplate-error@+1 {{pack declaration outside of template}}
   constexpr auto [x_c, ...rest_c, y_c] = arr;
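These binding-pack tests exercise C++2c structured binding packs, where a single `...pack` in the name list absorbs the elements not claimed by the other names. A hedged sketch (hypothetical names; assumes `-std=c++2c`, and note the nontemplate test above treats a pack outside a template as an error, hence the template wrapper here):

```c++
// Sketch: inside a template, the pack binds the middle two elements.
template <typename = void>
void pack_demo() {
  int arr[4] = {0, 1, 2, 3};
  auto [first, ...middle, last] = arr;
  static_assert(sizeof...(middle) == 2);
}
```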
diff --git a/clang/test/SemaCXX/cxx2c-binding-pack.cpp b/clang/test/SemaCXX/cxx2c-binding-pack.cpp
index a8c1386ea5c18..0b0eb88fe4c98 100644
--- a/clang/test/SemaCXX/cxx2c-binding-pack.cpp
+++ b/clang/test/SemaCXX/cxx2c-binding-pack.cpp
@@ -82,7 +82,7 @@ void decompose_array() {
   static_assert(sizeof...(b) == 0);
   auto [...c] = arr1;
   static_assert(sizeof...(c) == 1);
-  auto [a1, ...b1, c1] = arr1; // expected-error{{decomposes into 1 element, but 3 names were provided}}
+  auto [a1, ...b1, c1] = arr1; // expected-error{{binds to 1 element, but 3 names were provided}}
 }

 // Test case by Younan Zhang.
@@ -160,7 +160,7 @@ void now_i_know_my() {
   static_assert(sizeof...(e) == 2);
   auto [h, i, j, ...k] = C(); // OK, the pack k is empty
   static_assert(sizeof...(e) == 0);
-  auto [l, m, n, o, ...p] = C(); // expected-error{{{decomposes into 3 elements, but 5 names were provided}}}
+  auto [l, m, n, o, ...p] = C(); // expected-error{{{binds to 3 elements, but 5 names were provided}}}
 }

 } // namespace
@@ -225,7 +225,7 @@ namespace GH125165 {

 template
 auto f(auto t) {
   const auto& [...pack] = t;
-  // expected-error@-1 {{cannot decompose non-class, non-array type 'char const'}}
+  // expected-error@-1 {{cannot bind non-class, non-array type 'char const'}}
   (pack, ...);
 };
diff --git a/clang/test/SemaCXX/cxx2c-template-template-param.cpp b/clang/test/SemaCXX/cxx2c-template-template-param.cpp
index 4ad3fd95039cd..704df3112277f 100644
--- a/clang/test/SemaCXX/cxx2c-template-template-param.cpp
+++ b/clang/test/SemaCXX/cxx2c-template-template-param.cpp
@@ -350,3 +350,87 @@ template > T> // expected-error {{expected expression}} \
 // expected-error {{expected unqualified-id}}
 auto f();
 }
+
+namespace concept_arg_normalization {
+
+template concept C1>
+concept one = (C1); // #concept-arg-one
+
+template
+concept A = true; // #concept-arg-A
+
+template
+concept BetterA = A && true;
+
+template
+concept B = true; // #concept-arg-B
+
+template
+concept False = false; // #concept-arg-False
+
+template
+requires one
+void f1(T){} // #concept-arg-f1-1
+
+template
+requires one
+void f1(T){} // #concept-arg-f1-2
+
+template
+requires one
+void f2(T){}
+
+template
+requires one
+void f2(T){}
+
+
+template